4 * A compressor for the LZMS compression format.
8 * Copyright (C) 2013, 2014, 2015 Eric Biggers
10 * This file is free software; you can redistribute it and/or modify it under
11 * the terms of the GNU Lesser General Public License as published by the Free
12 * Software Foundation; either version 3 of the License, or (at your option) any
15 * This file is distributed in the hope that it will be useful, but WITHOUT
16 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
20 * You should have received a copy of the GNU Lesser General Public License
21 * along with this file; if not, see http://www.gnu.org/licenses/.
31 #include "wimlib/compress_common.h"
32 #include "wimlib/compressor_ops.h"
33 #include "wimlib/error.h"
34 #include "wimlib/lcpit_matchfinder.h"
35 #include "wimlib/lz_extend.h"
36 #include "wimlib/lz_hash.h"
37 #include "wimlib/lzms_common.h"
38 #include "wimlib/unaligned.h"
39 #include "wimlib/util.h"
42 * MAX_FAST_LENGTH is the maximum match length for which the length slot can be
43 * looked up directly in 'fast_length_slot_tab' and the length cost can be
44 * looked up directly in 'fast_length_cost_tab'.
46 * We also limit the 'nice_match_len' parameter to this value. Consequently, if
47 * the longest match found is shorter than 'nice_match_len', then it must also
48 * be shorter than MAX_FAST_LENGTH. This makes it possible to do fast lookups
49 * of length costs using 'fast_length_cost_tab' without having to keep checking
50 * whether the length exceeds MAX_FAST_LENGTH or not.
52 #define MAX_FAST_LENGTH 255
54 /* NUM_OPTIM_NODES is the maximum number of bytes the parsing algorithm will
55 * step forward before forcing the pending items to be encoded. If this value
56 * is increased, then there will be fewer forced flushes, but the probability
57 * entries and Huffman codes will be more likely to become outdated. */
58 #define NUM_OPTIM_NODES 2048
60 /* COST_SHIFT is a scaling factor that makes it possible to consider fractional
61 * bit costs. A single bit has a cost of (1 << COST_SHIFT). */
62 #define COST_SHIFT 6
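/*
 * Added illustration (not part of the original file): with COST_SHIFT == 6, a
 * whole bit costs 64 scaled units, so a bit whose predicted probability is 3/4
 * costs about -log2(0.75) * 64 ~= 27 scaled units.  A hypothetical helper that
 * computes such a scaled cost directly might look like:
 */
#if 0
#include <math.h>
static u32
example_scaled_bit_cost(double prob) /* hypothetical helper, for illustration */
{
	return (u32)round(-log2(prob) * (1 << COST_SHIFT));
}
#endif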
64 /* Length of the hash table for finding delta matches */
65 #define DELTA_HASH_ORDER 17
66 #define DELTA_HASH_LENGTH ((u32)1 << DELTA_HASH_ORDER)
68 /* The number of bytes to hash when finding delta matches; also taken to be the
69 * minimum length of an explicit offset delta match */
70 #define NBYTES_HASHED_FOR_DELTA 3
72 /* The number of delta match powers to consider (must be <=
73 * LZMS_NUM_DELTA_POWER_SYMS) */
74 #define NUM_POWERS_TO_CONSIDER 6
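/* Added note: since a delta match's span is ((u32)1 << power), considering
 * powers 0 through NUM_POWERS_TO_CONSIDER - 1 means considering spans of 1, 2,
 * 4, 8, 16, and 32 bytes. */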
76 /* This structure tracks the state of writing bits as a series of 16-bit coding
77 * units, starting at the end of the output buffer and proceeding backwards. */
78 struct lzms_output_bitstream {
80 /* Bits that haven't yet been written to the output buffer */
83 /* Number of bits currently held in @bitbuf */
86 /* Pointer to one past the next position in the output buffer at which
87 * to output a 16-bit coding unit */
90 /* Pointer to the beginning of the output buffer (this is the "end" when
91 * writing backwards!) */
95 /* This structure tracks the state of range encoding and its output, which
96 * starts at the beginning of the output buffer and proceeds forwards. */
97 struct lzms_range_encoder {
99 /* The lower boundary of the current range. Logically, this is a 33-bit
100 * integer whose high bit is needed to detect carries. */
103 /* The size of the current range */
106 /* The next 16-bit coding unit to output */
109 /* The number of 16-bit coding units whose output has been delayed due
110 * to possible carrying. The first such coding unit is @cache; all
111 * subsequent such coding units are 0xffff. */
114 /* Pointer to the beginning of the output buffer */
117 /* Pointer to the position in the output buffer at which the next coding
118 * unit must be written */
121 /* Pointer to just past the end of the output buffer */
125 /* Bookkeeping information for an adaptive Huffman code */
126 struct lzms_huffman_rebuild_info {
128 /* The remaining number of symbols to encode until this code must be
129 * rebuilt */
130 unsigned num_syms_until_rebuild;
132 /* The number of symbols in this code */
135 /* The rebuild frequency of this code, in symbols */
136 unsigned rebuild_freq;
138 /* The Huffman codeword of each symbol in this code */
141 /* The length of each Huffman codeword, in bits */
144 /* The frequency of each symbol in this code */
149 * The compressor-internal representation of a match or literal.
151 * Literals have length=1; matches have length > 1. (We disallow matches of
152 * length 1, even though this is a valid length in LZMS.)
154 * The source is encoded as follows:
156 * - Literals: the literal byte itself
157 * - Explicit offset LZ matches: the match offset plus (LZMS_NUM_LZ_REPS - 1)
158 * - Repeat offset LZ matches: the index of the offset in recent_lz_offsets
159 * - Explicit offset delta matches: DELTA_SOURCE_TAG is set, the next 3 bits are
160 * the power, and the remainder is the raw offset plus (LZMS_NUM_DELTA_REPS-1)
161 * - Repeat offset delta matches: DELTA_SOURCE_TAG is set, and the remainder is
162 * the index of the (power, raw_offset) pair in recent_delta_pairs
169 #define DELTA_SOURCE_TAG ((u32)1 << 31)
170 #define DELTA_SOURCE_POWER_SHIFT 28
171 #define DELTA_SOURCE_RAW_OFFSET_MASK (((u32)1 << DELTA_SOURCE_POWER_SHIFT) - 1)
174 check_that_powers_fit_in_bitfield(void)
176 STATIC_ASSERT(LZMS_NUM_DELTA_POWER_SYMS <= (1 << (31 - DELTA_SOURCE_POWER_SHIFT)));
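/*
 * Added worked example: an explicit offset delta match with power == 2 and raw
 * offset 1000 is represented as
 *
 *	DELTA_SOURCE_TAG | (2 << DELTA_SOURCE_POWER_SHIFT) |
 *		(1000 + LZMS_NUM_DELTA_REPS - 1)
 *
 * mirroring how 'pair' and 'source' are built in lzms_near_optimal_parse()
 * below, whereas a repeat offset LZ match at queue index 1 is represented
 * simply as source == 1.
 */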
179 /* A stripped-down version of the adaptive state in LZMS which excludes the
180 * probability entries and Huffman codes */
181 struct lzms_adaptive_state {
183 /* Recent offsets for LZ matches */
184 u32 recent_lz_offsets[LZMS_NUM_LZ_REPS + 1];
185 u32 prev_lz_offset; /* 0 means none */
186 u32 upcoming_lz_offset; /* 0 means none */
188 /* Recent (power, raw offset) pairs for delta matches.
189 * The low DELTA_SOURCE_POWER_SHIFT bits of each entry are the raw
190 * offset, and the high bits are the power. */
191 u32 recent_delta_pairs[LZMS_NUM_DELTA_REPS + 1];
192 u32 prev_delta_pair; /* 0 means none */
193 u32 upcoming_delta_pair; /* 0 means none */
195 /* States for predicting the probabilities of item types */
199 u8 lz_rep_states[LZMS_NUM_LZ_REP_DECISIONS];
201 u8 delta_rep_states[LZMS_NUM_DELTA_REP_DECISIONS];
202 } _aligned_attribute(64);
205 * This structure represents a byte position in the preprocessed input data and
206 * a node in the graph of possible match/literal choices.
208 * Logically, each incoming edge to this node is labeled with a literal or a
209 * match that can be taken to reach this position from an earlier position; and
210 * each outgoing edge from this node is labeled with a literal or a match that
211 * can be taken to advance from this position to a later position.
213 struct lzms_optimum_node {
216 * The cost of the lowest-cost path that has been found to reach this
217 * position. This can change as progressively lower cost paths are
218 * found to reach this position.
221 #define INFINITE_COST UINT32_MAX
224 * @item is the last item that was taken to reach this position with the
225 * stored @cost. This can change as progressively lower
226 * cost paths are found to reach this position.
228 * In some cases we look ahead more than one item. If we looked ahead n
229 * items to reach this position, then @item is the last item taken,
230 * @extra_items contains the other items ordered from second-to-last to
231 * first, and @num_extra_items is n - 1.
233 unsigned num_extra_items;
234 struct lzms_item item;
235 struct lzms_item extra_items[2];
238 * The adaptive state that exists at this position. This is filled in
239 * lazily, only after the minimum-cost path to this position is found.
241 * Note: the way the algorithm handles this adaptive state in the
242 * "minimum-cost" parse is actually only an approximation. It's
243 * possible for the globally optimal, minimum cost path to contain a
244 * prefix, ending at a position, where that path prefix is *not* the
245 * minimum cost path to that position. This can happen if such a path
246 * prefix results in a different adaptive state which results in lower
247 * costs later. Although the algorithm does do some heuristic
248 * multi-item lookaheads, it does not solve this problem in general.
250 * Note: this adaptive state structure also does not include the
251 * probability entries or current Huffman codewords. Those aren't
252 * maintained per-position and are only updated occasionally. */
254 struct lzms_adaptive_state state;
255 } _aligned_attribute(64);
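/*
 * Added sketch (simplified from the real parser below, not a drop-in helper):
 * the core relaxation step of the minimum-cost path search updates a later
 * node whenever reaching it through the current node is cheaper:
 */
#if 0
static void
example_relax_edge(struct lzms_optimum_node *cur_node, u32 len, u32 source,
		   u32 edge_cost) /* hypothetical helper, for illustration */
{
	u32 cost = cur_node->cost + edge_cost;
	if (cost < (cur_node + len)->cost) {
		(cur_node + len)->cost = cost;
		(cur_node + len)->item = (struct lzms_item) {
			.length = len,
			.source = source,
		};
		(cur_node + len)->num_extra_items = 0;
	}
}
#endif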
257 /* The main compressor structure */
258 struct lzms_compressor {
260 /* The matchfinder for LZ matches */
261 struct lcpit_matchfinder mf;
263 /* The preprocessed buffer of data being compressed */
266 /* The number of bytes of data to be compressed, which is the number of
267 * bytes of data in @in_buffer that are actually valid */
271 * Boolean flags to enable consideration of various types of multi-step
272 * operations during parsing.
274 * Among other cases, multi-step operations can help with gaps where two
275 * matches are separated by a non-matching byte.
277 * This idea is borrowed from Igor Pavlov's LZMA encoder.
280 bool try_lzrep_lit_lzrep0;
281 bool try_lzmatch_lit_lzrep0;
284 * If true, the compressor can use delta matches. This slows down
285 * compression. It improves the compression ratio greatly, slightly, or
286 * not at all, depending on the input data.
288 bool use_delta_matches;
290 /* If true, the compressor need not preserve the input buffer if it
291 * compresses the data successfully. */
294 /* 'last_target_usages' is a large array that is only needed for
295 * preprocessing, so it is in union with fields that don't need to be
296 * initialized until after preprocessing. */
300 /* Temporary space to store matches found by the LZ matchfinder */
301 struct lz_match matches[MAX_FAST_LENGTH - LZMS_MIN_MATCH_LENGTH + 1];
303 /* Hash table for finding delta matches */
304 u32 delta_hash_table[DELTA_HASH_LENGTH];
306 /* For each delta power, the hash code for the next sequence */
307 u32 next_delta_hashes[NUM_POWERS_TO_CONSIDER];
309 /* The per-byte graph nodes for near-optimal parsing */
310 struct lzms_optimum_node optimum_nodes[NUM_OPTIM_NODES + MAX_FAST_LENGTH +
311 1 + MAX_FAST_LENGTH];
313 /* Table: length => current cost for small match lengths */
314 u32 fast_length_cost_tab[MAX_FAST_LENGTH + 1];
316 /* Range encoder which outputs to the beginning of the compressed data
317 * buffer, proceeding forwards */
318 struct lzms_range_encoder rc;
320 /* Bitstream which outputs to the end of the compressed data buffer,
321 * proceeding backwards */
322 struct lzms_output_bitstream os;
324 /* States and probability entries for item type disambiguation */
326 unsigned match_state;
328 unsigned lz_rep_states[LZMS_NUM_LZ_REP_DECISIONS];
329 unsigned delta_state;
330 unsigned delta_rep_states[LZMS_NUM_DELTA_REP_DECISIONS];
331 struct lzms_probabilites probs;
335 struct lzms_huffman_rebuild_info literal_rebuild_info;
336 u32 literal_codewords[LZMS_NUM_LITERAL_SYMS];
337 u8 literal_lens[LZMS_NUM_LITERAL_SYMS];
338 u32 literal_freqs[LZMS_NUM_LITERAL_SYMS];
340 struct lzms_huffman_rebuild_info lz_offset_rebuild_info;
341 u32 lz_offset_codewords[LZMS_MAX_NUM_OFFSET_SYMS];
342 u8 lz_offset_lens[LZMS_MAX_NUM_OFFSET_SYMS];
343 u32 lz_offset_freqs[LZMS_MAX_NUM_OFFSET_SYMS];
345 struct lzms_huffman_rebuild_info length_rebuild_info;
346 u32 length_codewords[LZMS_NUM_LENGTH_SYMS];
347 u8 length_lens[LZMS_NUM_LENGTH_SYMS];
348 u32 length_freqs[LZMS_NUM_LENGTH_SYMS];
350 struct lzms_huffman_rebuild_info delta_offset_rebuild_info;
351 u32 delta_offset_codewords[LZMS_MAX_NUM_OFFSET_SYMS];
352 u8 delta_offset_lens[LZMS_MAX_NUM_OFFSET_SYMS];
353 u32 delta_offset_freqs[LZMS_MAX_NUM_OFFSET_SYMS];
355 struct lzms_huffman_rebuild_info delta_power_rebuild_info;
356 u32 delta_power_codewords[LZMS_NUM_DELTA_POWER_SYMS];
357 u8 delta_power_lens[LZMS_NUM_DELTA_POWER_SYMS];
358 u32 delta_power_freqs[LZMS_NUM_DELTA_POWER_SYMS];
362 s32 last_target_usages[65536];
366 /* Table: length => length slot for small match lengths */
367 u8 fast_length_slot_tab[MAX_FAST_LENGTH + 1];
369 /* Tables for mapping offsets to offset slots */
371 /* slots [0, 167); 0 <= num_extra_bits <= 10 */
372 u8 offset_slot_tab_1[0xe4a5];
374 /* slots [167, 427); 11 <= num_extra_bits <= 15 */
375 u16 offset_slot_tab_2[0x3d0000 >> 11];
377 /* slots [427, 799); 16 <= num_extra_bits */
378 u16 offset_slot_tab_3[((LZMS_MAX_MATCH_OFFSET + 1) - 0xe4a5) >> 16];
381 /******************************************************************************
382 * Offset and length slot acceleration *
383 ******************************************************************************/
385 /* Generate the acceleration table for length slots. */
387 lzms_init_fast_length_slot_tab(struct lzms_compressor *c)
389 unsigned slot = 0;
390 for (u32 len = LZMS_MIN_MATCH_LENGTH; len <= MAX_FAST_LENGTH; len++) {
391 if (len >= lzms_length_slot_base[slot + 1])
392 slot++;
393 c->fast_length_slot_tab[len] = slot;
397 /* Generate the acceleration tables for offset slots. */
399 lzms_init_offset_slot_tabs(struct lzms_compressor *c)
401 unsigned slot = 0;
402 u32 offset;
404 /* slots [0, 167); 0 <= num_extra_bits <= 10 */
405 for (offset = 1; offset < 0xe4a5; offset++) {
406 if (offset >= lzms_offset_slot_base[slot + 1])
407 slot++;
408 c->offset_slot_tab_1[offset] = slot;
411 /* slots [167, 427); 11 <= num_extra_bits <= 15 */
412 for (; offset < 0x3de4a5; offset += (u32)1 << 11) {
413 if (offset >= lzms_offset_slot_base[slot + 1])
414 slot++;
415 c->offset_slot_tab_2[(offset - 0xe4a5) >> 11] = slot;
418 /* slots [427, 799); 16 <= num_extra_bits */
419 for (; offset < LZMS_MAX_MATCH_OFFSET + 1; offset += (u32)1 << 16) {
420 if (offset >= lzms_offset_slot_base[slot + 1])
421 slot++;
422 c->offset_slot_tab_3[(offset - 0xe4a5) >> 16] = slot;
427 * Return the length slot for the specified match length, using the compressor's
428 * acceleration table if the length is small enough.
430 static inline unsigned
431 lzms_comp_get_length_slot(const struct lzms_compressor *c, u32 length)
433 if (likely(length <= MAX_FAST_LENGTH))
434 return c->fast_length_slot_tab[length];
435 return lzms_get_length_slot(length);
439 * Return the offset slot for the specified match offset, using the compressor's
440 * acceleration tables to speed up the mapping.
442 static inline unsigned
443 lzms_comp_get_offset_slot(const struct lzms_compressor *c, u32 offset)
445 if (offset < 0xe4a5)
446 return c->offset_slot_tab_1[offset];
447 offset -= 0xe4a5;
448 if (offset < 0x3d0000)
449 return c->offset_slot_tab_2[offset >> 11];
450 return c->offset_slot_tab_3[offset >> 16];
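/* Added worked example: offset 0x12345 is >= 0xe4a5, so it is first reduced to
 * 0x12345 - 0xe4a5 == 0x3ea0, which is < 0x3d0000; the slot therefore comes
 * from offset_slot_tab_2[0x3ea0 >> 11] == offset_slot_tab_2[7]. */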
453 /******************************************************************************
455 ******************************************************************************/
458 * Initialize the range encoder @rc to write forwards to the specified buffer
459 * @out that is @count 16-bit integers long.
462 lzms_range_encoder_init(struct lzms_range_encoder *rc, le16 *out, size_t count)
464 rc->lower_bound = 0;
465 rc->range_size = 0xffffffff;
466 rc->cache = 0;
467 rc->cache_size = 1;
468 rc->begin = out;
469 rc->next = out - 1;
470 rc->end = out + count;
474 * Attempt to flush bits from the range encoder.
476 * The basic idea is that we're writing bits from @rc->lower_bound to the
477 * output. However, due to carrying, the writing of coding units with the
478 * maximum value, as well as one prior coding unit, must be delayed until it is
479 * determined whether a carry is needed.
481 * This is based on the public domain code for LZMA written by Igor Pavlov, but
482 * with the following differences:
484 * - In LZMS, 16-bit coding units are required rather than 8-bit.
486 * - In LZMS, the first coding unit is not ignored by the decompressor, so
487 * the encoder cannot output a dummy value to that position.
490 lzms_range_encoder_shift_low(struct lzms_range_encoder *rc)
492 if ((u32)(rc->lower_bound) < 0xffff0000 ||
493 (u32)(rc->lower_bound >> 32) != 0)
495 /* Carry not needed (rc->lower_bound < 0xffff0000), or carry
496 * occurred ((rc->lower_bound >> 32) != 0, a.k.a. the carry bit
497 * is 1). */
498 do {
499 if (likely(rc->next >= rc->begin)) {
500 if (rc->next != rc->end) {
501 put_unaligned_u16_le(rc->cache +
502 (u16)(rc->lower_bound >> 32),
503 rc->next++);
504 }
505 } else {
506 rc->next++;
507 }
508 rc->cache = 0xffff;
509 } while (--rc->cache_size != 0);
511 rc->cache = (rc->lower_bound >> 16) & 0xffff;
512 }
513 rc->cache_size++;
514 rc->lower_bound = (rc->lower_bound & 0xffff) << 16;
515 }
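/* Added numeric sketch: suppose cache == 0x1234 and two 0xffff coding units
 * are pending (cache_size == 3).  If a carry out of the low 32 bits of
 * lower_bound occurs, the loop above emits 0x1235, 0x0000, 0x0000 -- the carry
 * propagates through the 0xffff units; with no carry it emits 0x1234, 0xffff,
 * 0xffff unchanged. */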
518 lzms_range_encoder_flush(struct lzms_range_encoder *rc)
520 for (int i = 0; i < 4; i++)
521 lzms_range_encoder_shift_low(rc);
522 return rc->next != rc->end;
526 * Encode the next bit using the range encoder.
528 * @prob is the probability out of LZMS_PROBABILITY_DENOMINATOR that the next
529 * bit is 0 rather than 1.
532 lzms_range_encode_bit(struct lzms_range_encoder *rc, int bit, u32 prob)
534 /* Normalize if needed. */
535 if (rc->range_size <= 0xffff) {
536 rc->range_size <<= 16;
537 lzms_range_encoder_shift_low(rc);
540 u32 bound = (rc->range_size >> LZMS_PROBABILITY_BITS) * prob;
541 if (bit == 0) {
542 rc->range_size = bound;
543 } else {
544 rc->lower_bound += bound;
545 rc->range_size -= bound;
546 }
547 }
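/* Added numeric sketch, assuming LZMS_PROBABILITY_DENOMINATOR == 64 (as the
 * cost table below indicates): with range_size == 0x10000000 and prob == 48,
 * bound == (0x10000000 >> LZMS_PROBABILITY_BITS) * 48 == 0x0c000000.  A 0 bit
 * narrows the range to those 0x0c000000 values; a 1 bit advances lower_bound
 * past them and keeps the remaining 0x04000000. */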
550 * Encode a bit. This wraps around lzms_range_encode_bit() to handle using and
551 * updating the state and its corresponding probability entry.
554 lzms_encode_bit(int bit, unsigned *state_p, unsigned num_states,
555 struct lzms_probability_entry *probs,
556 struct lzms_range_encoder *rc)
558 struct lzms_probability_entry *prob_entry;
559 u32 prob;
561 /* Load the probability entry for the current state. */
562 prob_entry = &probs[*state_p];
564 /* Update the state based on the next bit. */
565 *state_p = ((*state_p << 1) | bit) & (num_states - 1);
567 /* Get the probability that the bit is 0. */
568 prob = lzms_get_probability(prob_entry);
570 /* Update the probability entry. */
571 lzms_update_probability_entry(prob_entry, bit);
573 /* Encode the bit using the range encoder. */
574 lzms_range_encode_bit(rc, bit, prob);
577 /* Helper functions for encoding bits in the various decision classes */
580 lzms_encode_main_bit(struct lzms_compressor *c, int bit)
582 lzms_encode_bit(bit, &c->main_state, LZMS_NUM_MAIN_PROBS,
583 c->probs.main, &c->rc);
587 lzms_encode_match_bit(struct lzms_compressor *c, int bit)
589 lzms_encode_bit(bit, &c->match_state, LZMS_NUM_MATCH_PROBS,
590 c->probs.match, &c->rc);
594 lzms_encode_lz_bit(struct lzms_compressor *c, int bit)
596 lzms_encode_bit(bit, &c->lz_state, LZMS_NUM_LZ_PROBS,
597 c->probs.lz, &c->rc);
601 lzms_encode_lz_rep_bit(struct lzms_compressor *c, int bit, int idx)
603 lzms_encode_bit(bit, &c->lz_rep_states[idx], LZMS_NUM_LZ_REP_PROBS,
604 c->probs.lz_rep[idx], &c->rc);
608 lzms_encode_delta_bit(struct lzms_compressor *c, int bit)
610 lzms_encode_bit(bit, &c->delta_state, LZMS_NUM_DELTA_PROBS,
611 c->probs.delta, &c->rc);
615 lzms_encode_delta_rep_bit(struct lzms_compressor *c, int bit, int idx)
617 lzms_encode_bit(bit, &c->delta_rep_states[idx], LZMS_NUM_DELTA_REP_PROBS,
618 c->probs.delta_rep[idx], &c->rc);
621 /******************************************************************************
622 * Huffman encoding and verbatim bits *
623 ******************************************************************************/
626 * Initialize the output bitstream @os to write backwards to the specified
627 * buffer @out that is @count 16-bit integers long.
630 lzms_output_bitstream_init(struct lzms_output_bitstream *os,
631 le16 *out, size_t count)
633 os->bitbuf = 0;
634 os->bitcount = 0;
635 os->next = out + count;
636 os->begin = out;
640 * Write some bits, contained in the low-order @num_bits bits of @bits, to the
641 * output bitstream @os.
643 * @max_num_bits is a compile-time constant that specifies the maximum number of
644 * bits that can ever be written at this call site.
647 lzms_write_bits(struct lzms_output_bitstream *os, const u32 bits,
648 const unsigned num_bits, const unsigned max_num_bits)
650 /* Add the bits to the bit buffer variable. */
651 os->bitcount += num_bits;
652 os->bitbuf = (os->bitbuf << num_bits) | bits;
654 /* Check whether any coding units need to be written. */
655 while (os->bitcount >= 16) {
657 os->bitcount -= 16;
659 /* Write a coding unit, unless it would underflow the buffer. */
660 if (os->next != os->begin)
661 put_unaligned_u16_le(os->bitbuf >> os->bitcount, --os->next);
663 /* Optimization for call sites that never write more than 16
664 * bits at once */
665 if (max_num_bits <= 16)
666 return;
671 * Flush the output bitstream, ensuring that all bits written to it have been
672 * written to memory. Return %true if all bits have been output successfully,
673 * or %false if an overrun occurred.
676 lzms_output_bitstream_flush(struct lzms_output_bitstream *os)
678 if (os->next == os->begin)
679 return false;
681 if (os->bitcount != 0)
682 put_unaligned_u16_le(os->bitbuf << (16 - os->bitcount), --os->next);
684 return true;
685 }
688 lzms_build_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info)
690 make_canonical_huffman_code(rebuild_info->num_syms,
691 LZMS_MAX_CODEWORD_LENGTH,
692 rebuild_info->freqs,
693 rebuild_info->lens,
694 rebuild_info->codewords);
695 rebuild_info->num_syms_until_rebuild = rebuild_info->rebuild_freq;
699 lzms_init_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info,
700 unsigned num_syms, unsigned rebuild_freq,
701 u32 *codewords, u8 *lens, u32 *freqs)
703 rebuild_info->num_syms = num_syms;
704 rebuild_info->rebuild_freq = rebuild_freq;
705 rebuild_info->codewords = codewords;
706 rebuild_info->lens = lens;
707 rebuild_info->freqs = freqs;
708 lzms_init_symbol_frequencies(freqs, num_syms);
709 lzms_build_huffman_code(rebuild_info);
713 lzms_rebuild_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info)
715 lzms_build_huffman_code(rebuild_info);
716 lzms_dilute_symbol_frequencies(rebuild_info->freqs, rebuild_info->num_syms);
720 * Encode a symbol using the specified Huffman code. Then, if the Huffman code
721 * needs to be rebuilt, rebuild it and return true; otherwise return false.
724 lzms_huffman_encode_symbol(unsigned sym,
725 const u32 *codewords, const u8 *lens, u32 *freqs,
726 struct lzms_output_bitstream *os,
727 struct lzms_huffman_rebuild_info *rebuild_info)
729 lzms_write_bits(os, codewords[sym], lens[sym], LZMS_MAX_CODEWORD_LENGTH);
730 ++freqs[sym];
731 if (--rebuild_info->num_syms_until_rebuild == 0) {
732 lzms_rebuild_huffman_code(rebuild_info);
733 return true;
734 }
735 return false;
736 }
738 /* Helper routines to encode symbols using the various Huffman codes */
741 lzms_encode_literal_symbol(struct lzms_compressor *c, unsigned sym)
743 return lzms_huffman_encode_symbol(sym, c->literal_codewords,
744 c->literal_lens, c->literal_freqs,
745 &c->os, &c->literal_rebuild_info);
749 lzms_encode_lz_offset_symbol(struct lzms_compressor *c, unsigned sym)
751 return lzms_huffman_encode_symbol(sym, c->lz_offset_codewords,
752 c->lz_offset_lens, c->lz_offset_freqs,
753 &c->os, &c->lz_offset_rebuild_info);
757 lzms_encode_length_symbol(struct lzms_compressor *c, unsigned sym)
759 return lzms_huffman_encode_symbol(sym, c->length_codewords,
760 c->length_lens, c->length_freqs,
761 &c->os, &c->length_rebuild_info);
765 lzms_encode_delta_offset_symbol(struct lzms_compressor *c, unsigned sym)
767 return lzms_huffman_encode_symbol(sym, c->delta_offset_codewords,
768 c->delta_offset_lens, c->delta_offset_freqs,
769 &c->os, &c->delta_offset_rebuild_info);
773 lzms_encode_delta_power_symbol(struct lzms_compressor *c, unsigned sym)
775 return lzms_huffman_encode_symbol(sym, c->delta_power_codewords,
776 c->delta_power_lens, c->delta_power_freqs,
777 &c->os, &c->delta_power_rebuild_info);
781 lzms_update_fast_length_costs(struct lzms_compressor *c);
784 * Encode a match length. If this causes the Huffman code for length symbols to
785 * be rebuilt, also update the length costs array used by the parser.
788 lzms_encode_length(struct lzms_compressor *c, u32 length)
790 unsigned slot = lzms_comp_get_length_slot(c, length);
792 if (lzms_encode_length_symbol(c, slot))
793 lzms_update_fast_length_costs(c);
795 lzms_write_bits(&c->os, length - lzms_length_slot_base[slot],
796 lzms_extra_length_bits[slot],
797 LZMS_MAX_EXTRA_LENGTH_BITS);
800 /* Encode the offset of an LZ match. */
802 lzms_encode_lz_offset(struct lzms_compressor *c, u32 offset)
804 unsigned slot = lzms_comp_get_offset_slot(c, offset);
806 lzms_encode_lz_offset_symbol(c, slot);
807 lzms_write_bits(&c->os, offset - lzms_offset_slot_base[slot],
808 lzms_extra_offset_bits[slot],
809 LZMS_MAX_EXTRA_OFFSET_BITS);
812 /* Encode the raw offset of a delta match. */
814 lzms_encode_delta_raw_offset(struct lzms_compressor *c, u32 raw_offset)
816 unsigned slot = lzms_comp_get_offset_slot(c, raw_offset);
818 lzms_encode_delta_offset_symbol(c, slot);
819 lzms_write_bits(&c->os, raw_offset - lzms_offset_slot_base[slot],
820 lzms_extra_offset_bits[slot],
821 LZMS_MAX_EXTRA_OFFSET_BITS);
824 /******************************************************************************
826 ******************************************************************************/
828 /* Encode the specified item, which may be a literal or any type of match. */
830 lzms_encode_item(struct lzms_compressor *c, u32 length, u32 source)
832 /* Main bit: 0 = literal, 1 = match */
833 int main_bit = (length > 1);
834 lzms_encode_main_bit(c, main_bit);
836 if (!main_bit) {
837 /* Literal */
838 unsigned literal = source;
839 lzms_encode_literal_symbol(c, literal);
840 } else {
841 /* Match */
843 /* Match bit: 0 = LZ match, 1 = delta match */
844 int match_bit = (source & DELTA_SOURCE_TAG) != 0;
845 lzms_encode_match_bit(c, match_bit);
847 if (!match_bit) {
848 /* LZ match */
850 /* LZ bit: 0 = explicit offset, 1 = repeat offset */
851 int lz_bit = (source < LZMS_NUM_LZ_REPS);
852 lzms_encode_lz_bit(c, lz_bit);
854 if (!lz_bit) {
855 /* Explicit offset LZ match */
856 u32 offset = source - (LZMS_NUM_LZ_REPS - 1);
857 lzms_encode_lz_offset(c, offset);
858 } else {
859 /* Repeat offset LZ match */
860 int rep_idx = source;
861 for (int i = 0; i < rep_idx; i++)
862 lzms_encode_lz_rep_bit(c, 1, i);
863 if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS)
864 lzms_encode_lz_rep_bit(c, 0, rep_idx);
865 }
866 } else {
867 /* Delta match */
869 source &= ~DELTA_SOURCE_TAG;
871 /* Delta bit: 0 = explicit offset, 1 = repeat offset */
872 int delta_bit = (source < LZMS_NUM_DELTA_REPS);
873 lzms_encode_delta_bit(c, delta_bit);
875 if (!delta_bit) {
876 /* Explicit offset delta match */
877 u32 power = source >> DELTA_SOURCE_POWER_SHIFT;
878 u32 raw_offset = (source & DELTA_SOURCE_RAW_OFFSET_MASK) -
879 (LZMS_NUM_DELTA_REPS - 1);
880 lzms_encode_delta_power_symbol(c, power);
881 lzms_encode_delta_raw_offset(c, raw_offset);
882 } else {
883 /* Repeat offset delta match */
884 int rep_idx = source;
885 for (int i = 0; i < rep_idx; i++)
886 lzms_encode_delta_rep_bit(c, 1, i);
887 if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS)
888 lzms_encode_delta_rep_bit(c, 0, rep_idx);
889 }
890 }
892 /* Match length (encoded the same way for any match type) */
893 lzms_encode_length(c, length);
897 /* Encode a list of matches and literals chosen by the parsing algorithm. */
899 lzms_encode_nonempty_item_list(struct lzms_compressor *c,
900 struct lzms_optimum_node *end_node)
902 /* Since we've stored at each node the item we took to arrive at that
903 * node, we can trace our chosen path in backwards order. However, for
904 * encoding we need to trace our chosen path in forwards order. To make
905 * this possible, the following loop moves the items from their
906 * destination nodes to their source nodes, which effectively reverses
907 * the path. (Think of it like reversing a singly-linked list.) */
908 struct lzms_optimum_node *cur_node = end_node;
909 struct lzms_item saved_item = cur_node->item;
910 do {
911 struct lzms_item item = saved_item;
912 if (cur_node->num_extra_items > 0) {
913 /* Handle an arrival via multi-item lookahead. */
914 unsigned i = 0;
915 struct lzms_optimum_node *orig_node = cur_node;
916 do {
917 cur_node -= item.length;
918 cur_node->item = item;
919 item = orig_node->extra_items[i];
920 } while (++i != orig_node->num_extra_items);
922 cur_node -= item.length;
923 saved_item = cur_node->item;
924 cur_node->item = item;
925 } while (cur_node != c->optimum_nodes);
927 /* Now trace the chosen path in forwards order, encoding each item. */
928 do {
929 lzms_encode_item(c, cur_node->item.length, cur_node->item.source);
930 cur_node += cur_node->item.length;
931 } while (cur_node != end_node);
935 lzms_encode_item_list(struct lzms_compressor *c,
936 struct lzms_optimum_node *end_node)
938 if (end_node != c->optimum_nodes)
939 lzms_encode_nonempty_item_list(c, end_node);
942 /******************************************************************************
944 ******************************************************************************/
947 * If p is the predicted probability of the next bit being a 0, then the number
948 * of bits required to encode a 0 bit using a binary range encoder is the real
949 * number -log2(p), and the number of bits required to encode a 1 bit is the
950 * real number -log2(1 - p). To avoid computing either of these expressions at
951 * runtime, 'lzms_bit_costs' is a precomputed table that stores a mapping from
952 * probability to cost for each possible probability. Specifically, the array
953 * indices are the numerators of the possible probabilities in LZMS, where the
954 * denominators are LZMS_PROBABILITY_DENOMINATOR; and the stored costs are the
955 * bit costs multiplied by 1<<COST_SHIFT and rounded to the nearest integer.
956 * Furthermore, the values stored for 0% and 100% probabilities are equal to the
957 * adjacent values, since these probabilities are not actually permitted. This
958 * allows us to use the num_recent_zero_bits value from the
959 * lzms_probability_entry as the array index without fixing up these two special
960 * cases. */
962 static const u32 lzms_bit_costs[LZMS_PROBABILITY_DENOMINATOR + 1] = {
963 384, 384, 320, 283, 256, 235, 219, 204,
964 192, 181, 171, 163, 155, 147, 140, 134,
965 128, 122, 117, 112, 107, 103, 99, 94,
966 91, 87, 83, 80, 76, 73, 70, 67,
967 64, 61, 58, 56, 53, 51, 48, 46,
968 43, 41, 39, 37, 35, 33, 30, 29,
969 27, 25, 23, 21, 19, 17, 16, 14,
970 12, 11, 9, 8, 6, 4, 3, 1,
971 1
972 };
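/* Added check of one entry: index 48 corresponds to probability 48/64, and
 * round(-log2(48.0 / 64) * (1 << COST_SHIFT)) == round(0.415 * 64) == 27,
 * which is the value stored above; the reference generator below produces the
 * whole table the same way. */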
975 check_cost_shift(void)
977 /* lzms_bit_costs is hard-coded to the current COST_SHIFT. */
978 STATIC_ASSERT(COST_SHIFT == 6);
984 static void
985 lzms_compute_bit_costs(void)
986 {
987 for (u32 i = 0; i <= LZMS_PROBABILITY_DENOMINATOR; i++) {
988 u32 prob = i;
989 if (prob == 0)
990 prob++;
991 else if (prob == LZMS_PROBABILITY_DENOMINATOR)
992 prob--;
994 lzms_bit_costs[i] = round(-log2((double)prob / LZMS_PROBABILITY_DENOMINATOR) *
995 (1 << COST_SHIFT));
996 }
997 }
1000 /* Return the cost to encode a 0 bit in the specified context. */
1002 lzms_bit_0_cost(unsigned state, const struct lzms_probability_entry *probs)
1004 return lzms_bit_costs[probs[state].num_recent_zero_bits];
1007 /* Return the cost to encode a 1 bit in the specified context. */
1009 lzms_bit_1_cost(unsigned state, const struct lzms_probability_entry *probs)
1011 return lzms_bit_costs[LZMS_PROBABILITY_DENOMINATOR -
1012 probs[state].num_recent_zero_bits];
1015 /* Return the cost to encode a literal, including the main bit. */
1017 lzms_literal_cost(struct lzms_compressor *c, unsigned main_state, unsigned literal)
1019 return lzms_bit_0_cost(main_state, c->probs.main) +
1020 ((u32)c->literal_lens[literal] << COST_SHIFT);
1023 /* Update 'fast_length_cost_tab' to use the latest Huffman code. */
1025 lzms_update_fast_length_costs(struct lzms_compressor *c)
1027 int slot = -1;
1028 u32 cost = 0;
1029 for (u32 len = LZMS_MIN_MATCH_LENGTH; len <= MAX_FAST_LENGTH; len++) {
1030 if (len >= lzms_length_slot_base[slot + 1]) {
1031 slot++;
1032 cost = (u32)(c->length_lens[slot] +
1033 lzms_extra_length_bits[slot]) << COST_SHIFT;
1035 c->fast_length_cost_tab[len] = cost;
1039 /* Return the cost to encode the specified match length, which must not exceed
1040 * MAX_FAST_LENGTH. */
1042 lzms_fast_length_cost(const struct lzms_compressor *c, u32 length)
1044 return c->fast_length_cost_tab[length];
1047 /* Return the cost to encode the specified LZ match offset. */
1049 lzms_lz_offset_cost(const struct lzms_compressor *c, u32 offset)
1051 unsigned slot = lzms_comp_get_offset_slot(c, offset);
1052 u32 num_bits = c->lz_offset_lens[slot] + lzms_extra_offset_bits[slot];
1053 return num_bits << COST_SHIFT;
1056 /* Return the cost to encode the specified delta power and raw offset. */
1058 lzms_delta_source_cost(const struct lzms_compressor *c, u32 power, u32 raw_offset)
1060 unsigned slot = lzms_comp_get_offset_slot(c, raw_offset);
1061 u32 num_bits = c->delta_power_lens[power] + c->delta_offset_lens[slot] +
1062 lzms_extra_offset_bits[slot];
1063 return num_bits << COST_SHIFT;
1066 /******************************************************************************
1068 ******************************************************************************/
1071 lzms_init_adaptive_state(struct lzms_adaptive_state *state)
1073 for (int i = 0; i < LZMS_NUM_LZ_REPS + 1; i++)
1074 state->recent_lz_offsets[i] = i + 1;
1075 state->prev_lz_offset = 0;
1076 state->upcoming_lz_offset = 0;
1078 for (int i = 0; i < LZMS_NUM_DELTA_REPS + 1; i++)
1079 state->recent_delta_pairs[i] = i + 1;
1080 state->prev_delta_pair = 0;
1081 state->upcoming_delta_pair = 0;
1083 state->main_state = 0;
1084 state->match_state = 0;
1085 state->lz_state = 0;
1086 for (int i = 0; i < LZMS_NUM_LZ_REP_DECISIONS; i++)
1087 state->lz_rep_states[i] = 0;
1088 state->delta_state = 0;
1089 for (int i = 0; i < LZMS_NUM_DELTA_REP_DECISIONS; i++)
1090 state->delta_rep_states[i] = 0;
1094 * Update the LRU queues for match sources when advancing by one item.
1096 * Note: using LZMA as a point of comparison, the LRU queues in LZMS are more
1097 * complex because:
1098 * - there are separate queues for LZ and delta matches
1099 * - updates to the queues are delayed by one encoded item (this prevents
1100 * sources from being bumped up to index 0 too early)
1103 lzms_update_lru_queues(struct lzms_adaptive_state *state)
1105 if (state->prev_lz_offset != 0) {
1106 for (int i = LZMS_NUM_LZ_REPS - 1; i >= 0; i--)
1107 state->recent_lz_offsets[i + 1] = state->recent_lz_offsets[i];
1108 state->recent_lz_offsets[0] = state->prev_lz_offset;
1110 state->prev_lz_offset = state->upcoming_lz_offset;
1112 if (state->prev_delta_pair != 0) {
1113 for (int i = LZMS_NUM_DELTA_REPS - 1; i >= 0; i--)
1114 state->recent_delta_pairs[i + 1] = state->recent_delta_pairs[i];
1115 state->recent_delta_pairs[0] = state->prev_delta_pair;
1117 state->prev_delta_pair = state->upcoming_delta_pair;
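/* Added timeline sketch: the offset recorded as 'upcoming' when an item is
 * chosen becomes prev_lz_offset during that item's queue update, and is
 * promoted into recent_lz_offsets[0] only by the following item's update --
 * this is the one-item delay described above. */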
1121 lzms_update_state(u8 *state_p, int bit, unsigned num_states)
1123 *state_p = ((*state_p << 1) | bit) & (num_states - 1);
1127 lzms_update_main_state(struct lzms_adaptive_state *state, int is_match)
1129 lzms_update_state(&state->main_state, is_match, LZMS_NUM_MAIN_PROBS);
1133 lzms_update_match_state(struct lzms_adaptive_state *state, int is_delta)
1135 lzms_update_state(&state->match_state, is_delta, LZMS_NUM_MATCH_PROBS);
1139 lzms_update_lz_state(struct lzms_adaptive_state *state, int is_rep)
1141 lzms_update_state(&state->lz_state, is_rep, LZMS_NUM_LZ_PROBS);
1145 lzms_update_lz_rep_states(struct lzms_adaptive_state *state, int rep_idx)
1147 for (int i = 0; i < rep_idx; i++)
1148 lzms_update_state(&state->lz_rep_states[i], 1, LZMS_NUM_LZ_REP_PROBS);
1150 if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS)
1151 lzms_update_state(&state->lz_rep_states[rep_idx], 0, LZMS_NUM_LZ_REP_PROBS);
1155 lzms_update_delta_state(struct lzms_adaptive_state *state, int is_rep)
1157 lzms_update_state(&state->delta_state, is_rep, LZMS_NUM_DELTA_PROBS);
1161 lzms_update_delta_rep_states(struct lzms_adaptive_state *state, int rep_idx)
1163 for (int i = 0; i < rep_idx; i++)
1164 lzms_update_state(&state->delta_rep_states[i], 1, LZMS_NUM_DELTA_REP_PROBS);
1166 if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS)
1167 lzms_update_state(&state->delta_rep_states[rep_idx], 0, LZMS_NUM_DELTA_REP_PROBS);
1170 /******************************************************************************
1172 ******************************************************************************/
1174 /* Note: this code just handles finding delta matches. The code for finding LZ
1175 * matches is elsewhere. */
1178 /* Initialize the delta matchfinder for a new input buffer. */
1180 lzms_init_delta_matchfinder(struct lzms_compressor *c)
1182 /* Set all entries to use an invalid power, which will never match. */
1183 STATIC_ASSERT(NUM_POWERS_TO_CONSIDER < (1 << (32 - DELTA_SOURCE_POWER_SHIFT)));
1184 memset(c->delta_hash_table, 0xFF, sizeof(c->delta_hash_table));
1186 /* Initialize the next hash code for each power. We can just use zeroes
1187 * initially; it doesn't really matter. */
1188 for (u32 i = 0; i < NUM_POWERS_TO_CONSIDER; i++)
1189 c->next_delta_hashes[i] = 0;
1193 * Compute a DELTA_HASH_ORDER-bit hash code for the first
1194 * NBYTES_HASHED_FOR_DELTA bytes of the sequence beginning at @p when taken in a
1195 * delta context with the specified @span.
1198 lzms_delta_hash(const u8 *p, const u32 pos, u32 span)
1200 /* A delta match has a certain span and an offset that is a multiple of
1201 * that span. To reduce wasted space we use a single combined hash
1202 * table for all spans and positions, but to minimize collisions we
1203 * include in the hash code computation the span and the low-order bits
1204 * of the current position. */
1206 STATIC_ASSERT(NBYTES_HASHED_FOR_DELTA == 3);
1207 u8 d0 = *(p + 0) - *(p + 0 - span);
1208 u8 d1 = *(p + 1) - *(p + 1 - span);
1209 u8 d2 = *(p + 2) - *(p + 2 - span);
1210 u32 v = ((span + (pos & (span - 1))) << 24) |
1211 ((u32)d2 << 16) | ((u32)d1 << 8) | d0;
1212 return lz_hash(v, DELTA_HASH_ORDER);
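/* Added worked example: with span == 4, the byte sequence 10, 20, 30, 40, 14,
 * 24, 34, 44 yields the deltas 4, 4, 4, 4 at its last four positions, so data
 * that differs only by a constant stride (e.g. arrays of increasing integers)
 * turns into highly repetitive delta sequences that are easy to match. */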
1216 * Given a match between @in_next and @matchptr in a delta context with the
1217 * specified @span and having the initial @len, extend the match as far as
1218 * possible, up to a limit of @max_len.
1221 lzms_extend_delta_match(const u8 *in_next, const u8 *matchptr,
1222 u32 len, u32 max_len, u32 span)
1224 while (len < max_len &&
1225 (u8)(*(in_next + len) - *(in_next + len - span)) ==
1226 (u8)(*(matchptr + len) - *(matchptr + len - span)))
1227 len++;
1228 return len;
1229 }
1234 lzms_delta_matchfinder_skip_bytes(struct lzms_compressor *c,
1235 const u8 *in_next, u32 count)
1237 u32 pos = in_next - c->in_buffer;
1238 if (unlikely(c->in_nbytes - (pos + count) <= NBYTES_HASHED_FOR_DELTA + 1))
1239 return;
1240 do {
1241 /* Update the hash table for each power. */
1242 for (u32 power = 0; power < NUM_POWERS_TO_CONSIDER; power++) {
1243 const u32 span = (u32)1 << power;
1244 if (unlikely(pos < span))
1245 continue;
1246 const u32 next_hash = lzms_delta_hash(in_next + 1, pos + 1, span);
1247 const u32 hash = c->next_delta_hashes[power];
1248 c->delta_hash_table[hash] =
1249 (power << DELTA_SOURCE_POWER_SHIFT) | pos;
1250 c->next_delta_hashes[power] = next_hash;
1251 prefetchw(&c->delta_hash_table[next_hash]);
1253 } while (in_next++, pos++, --count);
1257 * Skip the next @count bytes (don't search for matches at them). @in_next
1258 * points to the first byte to skip. The return value is @in_next + count.
1261 lzms_skip_bytes(struct lzms_compressor *c, u32 count, const u8 *in_next)
1263 lcpit_matchfinder_skip_bytes(&c->mf, count);
1264 if (c->use_delta_matches)
1265 lzms_delta_matchfinder_skip_bytes(c, in_next, count);
1266 return in_next + count;
1269 /******************************************************************************
1270 * "Near-optimal" parsing *
1271 ******************************************************************************/
1274 * The main near-optimal parsing routine.
1276 * Briefly, the algorithm does an approximate minimum-cost path search to find a
1277 * "near-optimal" sequence of matches and literals to output, based on the
1278 * current cost model. The algorithm steps forward, position by position (byte
1279 * by byte), and updates the minimum cost path to reach each later position that
1280 * can be reached using a match or literal from the current position. This is
1281 * essentially Dijkstra's algorithm in disguise: the graph nodes are positions,
1282 * the graph edges are possible matches/literals to code, and the cost of each
1283 * edge is the estimated number of bits (scaled up by COST_SHIFT) that will be
1284 * required to output the corresponding match or literal. But one difference is
1285 * that we actually compute the lowest-cost path in pieces, where each piece is
1286 * terminated when there are no choices to be made.
1288 * The costs of literals and matches are estimated using the range encoder
1289 * states and the semi-adaptive Huffman codes. Except for range encoding
1290 * states, costs are assumed to be constant throughout a single run of the
1291 * parsing algorithm, which can parse up to NUM_OPTIM_NODES bytes of data. This
1292 * introduces a source of non-optimality because the probabilities and Huffman
1293 * codes can change over this part of the data. And of course, there are
1294 * various other reasons why the result isn't optimal in terms of compression
1295 * ratio.
1298 lzms_near_optimal_parse(struct lzms_compressor *c)
1300 const u8 *in_next = c->in_buffer;
1301 const u8 * const in_end = &c->in_buffer[c->in_nbytes];
1302 struct lzms_optimum_node *cur_node;
1303 struct lzms_optimum_node *end_node;
1304 u32 num_matches;
1305 /* Set initial length costs for lengths <= MAX_FAST_LENGTH. */
1306 lzms_update_fast_length_costs(c);
1308 /* Set up the initial adaptive state. */
1309 lzms_init_adaptive_state(&c->optimum_nodes[0].state);
1311 begin:
1312 /* Start building a new list of items, which will correspond to the next
1313 * piece of the overall minimum-cost path. */
1315 cur_node = c->optimum_nodes;
1316 cur_node->cost = 0;
1317 end_node = cur_node;
1319 if (in_next == in_end)
1320 return;
1322 /* The following loop runs once per byte in the input buffer, except in
1323 * a few shortcut cases. */
1327 /* Repeat offset LZ matches */
1328 if (likely(in_next - c->in_buffer >= LZMS_NUM_LZ_REPS &&
1329 in_end - in_next >= 2))
1331 for (int rep_idx = 0; rep_idx < LZMS_NUM_LZ_REPS; rep_idx++) {
1333 /* Looking for a repeat offset LZ match at queue
1334 * index @rep_idx */
1336 const u32 offset = cur_node->state.recent_lz_offsets[rep_idx];
1337 const u8 * const matchptr = in_next - offset;
1339 /* Check the first 2 bytes before entering the extension loop. */
1340 if (load_u16_unaligned(in_next) != load_u16_unaligned(matchptr))
1341 continue;
1343 /* Extend the match to its full length. */
1344 const u32 rep_len = lz_extend(in_next, matchptr, 2, in_end - in_next);
1346 /* Early out for long repeat offset LZ match */
1347 if (rep_len >= c->mf.nice_match_len) {
1349 in_next = lzms_skip_bytes(c, rep_len, in_next);
1351 lzms_encode_item_list(c, cur_node);
1352 lzms_encode_item(c, rep_len, rep_idx);
1354 c->optimum_nodes[0].state = cur_node->state;
1355 cur_node = &c->optimum_nodes[0];
1357 cur_node->state.upcoming_lz_offset =
1358 cur_node->state.recent_lz_offsets[rep_idx];
1359 cur_node->state.upcoming_delta_pair = 0;
1360 for (int i = rep_idx; i < LZMS_NUM_LZ_REPS; i++)
1361 cur_node->state.recent_lz_offsets[i] =
1362 cur_node->state.recent_lz_offsets[i + 1];
1363 lzms_update_lru_queues(&cur_node->state);
1364 lzms_update_main_state(&cur_node->state, 1);
1365 lzms_update_match_state(&cur_node->state, 0);
1366 lzms_update_lz_state(&cur_node->state, 1);
1367 lzms_update_lz_rep_states(&cur_node->state, rep_idx);
1368 goto begin;
1369 }
1371 while (end_node < cur_node + rep_len)
1372 (++end_node)->cost = INFINITE_COST;
1374 u32 base_cost = cur_node->cost +
1375 lzms_bit_1_cost(cur_node->state.main_state,
1376 c->probs.main) +
1377 lzms_bit_0_cost(cur_node->state.match_state,
1378 c->probs.match) +
1379 lzms_bit_1_cost(cur_node->state.lz_state,
1380 c->probs.lz);
1382 for (int i = 0; i < rep_idx; i++)
1383 base_cost += lzms_bit_1_cost(cur_node->state.lz_rep_states[i],
1384 c->probs.lz_rep[i]);
1386 if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS)
1387 base_cost += lzms_bit_0_cost(cur_node->state.lz_rep_states[rep_idx],
1388 c->probs.lz_rep[rep_idx]);
1390 u32 len = 2;
1391 do {
1392 u32 cost = base_cost + lzms_fast_length_cost(c, len);
1393 if (cost < (cur_node + len)->cost) {
1394 (cur_node + len)->cost = cost;
1395 (cur_node + len)->item = (struct lzms_item) {
1396 .length = len,
1397 .source = rep_idx,
1398 };
1399 (cur_node + len)->num_extra_items = 0;
1401 } while (++len <= rep_len);
1404 /* try LZ-rep + lit + LZ-rep0 */
1405 if (c->try_lzrep_lit_lzrep0 &&
1406 in_end - (in_next + rep_len) >= 3 &&
1407 load_u16_unaligned(in_next + rep_len + 1) ==
1408 load_u16_unaligned(matchptr + rep_len + 1))
1409 {
1410 const u32 rep0_len = lz_extend(in_next + rep_len + 1,
1411 matchptr + rep_len + 1,
1412 2,
1413 min(c->mf.nice_match_len,
1414 in_end - (in_next + rep_len + 1)));
1416 unsigned main_state = cur_node->state.main_state;
1417 unsigned match_state = cur_node->state.match_state;
1418 unsigned lz_state = cur_node->state.lz_state;
1419 unsigned lz_rep0_state = cur_node->state.lz_rep_states[0];
1421 /* update states for LZ-rep */
1422 main_state = ((main_state << 1) | 1) % LZMS_NUM_MAIN_PROBS;
1423 match_state = ((match_state << 1) | 0) % LZMS_NUM_MATCH_PROBS;
1424 lz_state = ((lz_state << 1) | 1) % LZMS_NUM_LZ_PROBS;
1425 lz_rep0_state = ((lz_rep0_state << 1) | (rep_idx > 0)) %
1426 LZMS_NUM_LZ_REP_PROBS;
1429 u32 cost = base_cost + lzms_fast_length_cost(c, rep_len);
1431 /* add literal cost */
1432 cost += lzms_literal_cost(c, main_state, *(in_next + rep_len));
1434 /* update state for literal */
1435 main_state = ((main_state << 1) | 0) % LZMS_NUM_MAIN_PROBS;
1437 /* add LZ-rep0 cost */
1438 cost += lzms_bit_1_cost(main_state, c->probs.main) +
1439 lzms_bit_0_cost(match_state, c->probs.match) +
1440 lzms_bit_1_cost(lz_state, c->probs.lz) +
1441 lzms_bit_0_cost(lz_rep0_state, c->probs.lz_rep[0]) +
1442 lzms_fast_length_cost(c, rep0_len);
1444 const u32 total_len = rep_len + 1 + rep0_len;
1446 while (end_node < cur_node + total_len)
1447 (++end_node)->cost = INFINITE_COST;
1449 if (cost < (cur_node + total_len)->cost) {
1450 (cur_node + total_len)->cost = cost;
1451 (cur_node + total_len)->item = (struct lzms_item) {
1452 .length = rep0_len,
1453 .source = 0,
1454 };
1455 (cur_node + total_len)->extra_items[0] = (struct lzms_item) {
1456 .length = 1,
1457 .source = *(in_next + rep_len),
1458 };
1459 (cur_node + total_len)->extra_items[1] = (struct lzms_item) {
1460 .length = rep_len,
1461 .source = rep_idx,
1462 };
1463 (cur_node + total_len)->num_extra_items = 2;
1469 /* Repeat offset delta matches */
1470 if (c->use_delta_matches &&
1471 likely(in_next - c->in_buffer >= LZMS_NUM_DELTA_REPS + 1 &&
1472 (in_end - in_next >= 2)))
1474 for (int rep_idx = 0; rep_idx < LZMS_NUM_DELTA_REPS; rep_idx++) {
1476 /* Looking for a repeat offset delta match at
1477 * queue index @rep_idx */
1479 const u32 pair = cur_node->state.recent_delta_pairs[rep_idx];
1480 const u32 power = pair >> DELTA_SOURCE_POWER_SHIFT;
1481 const u32 raw_offset = pair & DELTA_SOURCE_RAW_OFFSET_MASK;
1482 const u32 span = (u32)1 << power;
1483 const u32 offset = raw_offset << power;
1484 const u8 * const matchptr = in_next - offset;
1486 /* Check the first 2 bytes before entering the
1487 * extension loop. */
1488 if (((u8)(*(in_next + 0) - *(in_next + 0 - span)) !=
1489 (u8)(*(matchptr + 0) - *(matchptr + 0 - span))) ||
1490 ((u8)(*(in_next + 1) - *(in_next + 1 - span)) !=
1491 (u8)(*(matchptr + 1) - *(matchptr + 1 - span))))
1492 continue;
1494 /* Extend the match to its full length. */
1495 const u32 rep_len = lzms_extend_delta_match(in_next, matchptr,
1496 2, in_end - in_next,
1497 span);
1499 /* Early out for long repeat offset delta match */
1500 if (rep_len >= c->mf.nice_match_len) {
1502 in_next = lzms_skip_bytes(c, rep_len, in_next);
1504 lzms_encode_item_list(c, cur_node);
1505 lzms_encode_item(c, rep_len, DELTA_SOURCE_TAG | rep_idx);
1507 c->optimum_nodes[0].state = cur_node->state;
1508 cur_node = &c->optimum_nodes[0];
1510 cur_node->state.upcoming_delta_pair = pair;
1511 cur_node->state.upcoming_lz_offset = 0;
1512 for (int i = rep_idx; i < LZMS_NUM_DELTA_REPS; i++)
1513 cur_node->state.recent_delta_pairs[i] =
1514 cur_node->state.recent_delta_pairs[i + 1];
1515 lzms_update_lru_queues(&cur_node->state);
1516 lzms_update_main_state(&cur_node->state, 1);
1517 lzms_update_match_state(&cur_node->state, 1);
1518 lzms_update_delta_state(&cur_node->state, 1);
1519 lzms_update_delta_rep_states(&cur_node->state, rep_idx);
1520 goto begin;
1521 }
1523 while (end_node < cur_node + rep_len)
1524 (++end_node)->cost = INFINITE_COST;
1526 u32 base_cost = cur_node->cost +
1527 lzms_bit_1_cost(cur_node->state.main_state,
1528 c->probs.main) +
1529 lzms_bit_1_cost(cur_node->state.match_state,
1530 c->probs.match) +
1531 lzms_bit_1_cost(cur_node->state.delta_state,
1532 c->probs.delta);
1534 for (int i = 0; i < rep_idx; i++)
1535 base_cost += lzms_bit_1_cost(cur_node->state.delta_rep_states[i],
1536 c->probs.delta_rep[i]);
1538 if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS)
1539 base_cost += lzms_bit_0_cost(cur_node->state.delta_rep_states[rep_idx],
1540 c->probs.delta_rep[rep_idx]);
1542 u32 len = 2;
1543 do {
1544 u32 cost = base_cost + lzms_fast_length_cost(c, len);
1545 if (cost < (cur_node + len)->cost) {
1546 (cur_node + len)->cost = cost;
1547 (cur_node + len)->item = (struct lzms_item) {
1548 .length = len,
1549 .source = DELTA_SOURCE_TAG | rep_idx,
1550 };
1551 (cur_node + len)->num_extra_items = 0;
1553 } while (++len <= rep_len);
1557 /* Explicit offset LZ matches */
1558 num_matches = lcpit_matchfinder_get_matches(&c->mf, c->matches);
1560 if (num_matches) {
1561 u32 best_len = c->matches[0].length;
1563 /* Early out for long explicit offset LZ match */
1564 if (best_len >= c->mf.nice_match_len) {
1566 const u32 offset = c->matches[0].offset;
1568 /* Extend the match as far as possible.
1569 * This is necessary because the LCP-interval
1570 * tree matchfinder only reports up to
1571 * nice_match_len bytes. */
1572 best_len = lz_extend(in_next, in_next - offset,
1573 best_len, in_end - in_next);
1575 in_next = lzms_skip_bytes(c, best_len - 1, in_next + 1);
1577 lzms_encode_item_list(c, cur_node);
1578 lzms_encode_item(c, best_len, offset + LZMS_NUM_LZ_REPS - 1);
1580 c->optimum_nodes[0].state = cur_node->state;
1581 cur_node = &c->optimum_nodes[0];
1583 cur_node->state.upcoming_lz_offset = offset;
1584 cur_node->state.upcoming_delta_pair = 0;
1585 lzms_update_lru_queues(&cur_node->state);
1586 lzms_update_main_state(&cur_node->state, 1);
1587 lzms_update_match_state(&cur_node->state, 0);
1588 lzms_update_lz_state(&cur_node->state, 0);
1589 goto begin;
1590 }
1592 while (end_node < cur_node + best_len)
1593 (++end_node)->cost = INFINITE_COST;
1595 u32 base_cost = cur_node->cost +
1596 lzms_bit_1_cost(cur_node->state.main_state,
1597 c->probs.main) +
1598 lzms_bit_0_cost(cur_node->state.match_state,
1599 c->probs.match) +
1600 lzms_bit_0_cost(cur_node->state.lz_state,
1601 c->probs.lz);
1603 if (c->try_lzmatch_lit_lzrep0 &&
1604 likely(in_end - (in_next + c->matches[0].length) >= 3))
1605 {
1606 /* try LZ-match + lit + LZ-rep0 */
1608 u32 l = 2;
1609 u32 i = num_matches - 1;
1610 do {
1611 const u32 len = c->matches[i].length;
1612 const u32 offset = c->matches[i].offset;
1613 const u32 position_cost = base_cost +
1614 lzms_lz_offset_cost(c, offset);
1615 do {
1616 u32 cost = position_cost + lzms_fast_length_cost(c, l);
1617 if (cost < (cur_node + l)->cost) {
1618 (cur_node + l)->cost = cost;
1619 (cur_node + l)->item = (struct lzms_item) {
1620 .length = l,
1621 .source = offset + (LZMS_NUM_LZ_REPS - 1),
1622 };
1623 (cur_node + l)->num_extra_items = 0;
1625 } while (++l <= len);
1627 const u8 * const matchptr = in_next - offset;
1628 if (load_u16_unaligned(matchptr + len + 1) !=
1629 load_u16_unaligned(in_next + len + 1))
1630 continue;
1632 const u32 rep0_len = lz_extend(in_next + len + 1,
1633 matchptr + len + 1,
1634 2,
1635 min(c->mf.nice_match_len,
1636 in_end - (in_next + len + 1)));
1638 unsigned main_state = cur_node->state.main_state;
1639 unsigned match_state = cur_node->state.match_state;
1640 unsigned lz_state = cur_node->state.lz_state;
1642 /* update states for LZ-match */
1643 main_state = ((main_state << 1) | 1) % LZMS_NUM_MAIN_PROBS;
1644 match_state = ((match_state << 1) | 0) % LZMS_NUM_MATCH_PROBS;
1645 lz_state = ((lz_state << 1) | 0) % LZMS_NUM_LZ_PROBS;
1648 u32 cost = position_cost + lzms_fast_length_cost(c, len);
1650 /* add literal cost */
1651 cost += lzms_literal_cost(c, main_state, *(in_next + len));
1653 /* update state for literal */
1654 main_state = ((main_state << 1) | 0) % LZMS_NUM_MAIN_PROBS;
1656 /* add LZ-rep0 cost */
1657 cost += lzms_bit_1_cost(main_state, c->probs.main) +
1658 lzms_bit_0_cost(match_state, c->probs.match) +
1659 lzms_bit_1_cost(lz_state, c->probs.lz) +
1660 lzms_bit_0_cost(cur_node->state.lz_rep_states[0],
1661 c->probs.lz_rep[0]) +
1662 lzms_fast_length_cost(c, rep0_len);
1664 const u32 total_len = len + 1 + rep0_len;
1666 while (end_node < cur_node + total_len)
1667 (++end_node)->cost = INFINITE_COST;
1669 if (cost < (cur_node + total_len)->cost) {
1670 (cur_node + total_len)->cost = cost;
1671 (cur_node + total_len)->item = (struct lzms_item) {
1672 .length = rep0_len,
1673 .source = 0,
1674 };
1675 (cur_node + total_len)->extra_items[0] = (struct lzms_item) {
1676 .length = 1,
1677 .source = *(in_next + len),
1678 };
1679 (cur_node + total_len)->extra_items[1] = (struct lzms_item) {
1680 .length = len,
1681 .source = offset + LZMS_NUM_LZ_REPS - 1,
1682 };
1683 (cur_node + total_len)->num_extra_items = 2;
1684 }
1685 } while (i--);
1686 } else {
1687 u32 l = 2;
1688 u32 i = num_matches - 1;
1689 do {
1690 u32 position_cost = base_cost +
1691 lzms_lz_offset_cost(c, c->matches[i].offset);
1692 do {
1693 u32 cost = position_cost + lzms_fast_length_cost(c, l);
1694 if (cost < (cur_node + l)->cost) {
1695 (cur_node + l)->cost = cost;
1696 (cur_node + l)->item = (struct lzms_item) {
1697 .length = l,
1698 .source = c->matches[i].offset +
1699 (LZMS_NUM_LZ_REPS - 1),
1700 };
1701 (cur_node + l)->num_extra_items = 0;
1703 } while (++l <= c->matches[i].length);
1704 } while (i--);
1705 }
1706 }
1708 /* Explicit offset delta matches */
1709 if (c->use_delta_matches &&
1710 likely(in_end - in_next >= NBYTES_HASHED_FOR_DELTA + 1))
1712 const u32 pos = in_next - c->in_buffer;
1714 /* Consider each possible power (log2 of span) */
1715 STATIC_ASSERT(NUM_POWERS_TO_CONSIDER <= LZMS_NUM_DELTA_POWER_SYMS);
1716 for (u32 power = 0; power < NUM_POWERS_TO_CONSIDER; power++) {
1718 const u32 span = (u32)1 << power;
1720 if (unlikely(pos < span))
1721 continue;
1723 const u32 next_hash = lzms_delta_hash(in_next + 1, pos + 1, span);
1724 const u32 hash = c->next_delta_hashes[power];
1725 const u32 cur_match = c->delta_hash_table[hash];
1727 c->delta_hash_table[hash] = (power << DELTA_SOURCE_POWER_SHIFT) | pos;
1728 c->next_delta_hashes[power] = next_hash;
1729 prefetchw(&c->delta_hash_table[next_hash]);
1731 if (power != cur_match >> DELTA_SOURCE_POWER_SHIFT)
1732 continue;
1734 const u32 offset = pos - (cur_match & DELTA_SOURCE_RAW_OFFSET_MASK);
1736 /* The offset must be a multiple of span. */
1737 if (offset & (span - 1))
1738 continue;
1740 const u8 * const matchptr = in_next - offset;
1742 /* Check the first 3 bytes before entering the
1743 * extension loop. */
1744 STATIC_ASSERT(NBYTES_HASHED_FOR_DELTA == 3);
1745 if (((u8)(*(in_next + 0) - *(in_next + 0 - span)) !=
1746 (u8)(*(matchptr + 0) - *(matchptr + 0 - span))) ||
1747 ((u8)(*(in_next + 1) - *(in_next + 1 - span)) !=
1748 (u8)(*(matchptr + 1) - *(matchptr + 1 - span))) ||
1749 ((u8)(*(in_next + 2) - *(in_next + 2 - span)) !=
1750 (u8)(*(matchptr + 2) - *(matchptr + 2 - span))))
1751 continue;
1753 /* Extend the delta match to its full length. */
1754 const u32 len = lzms_extend_delta_match(in_next,
1755 matchptr,
1756 NBYTES_HASHED_FOR_DELTA,
1757 in_end - in_next,
1758 span);
1760 const u32 raw_offset = offset >> power;
1762 if (unlikely(raw_offset > DELTA_SOURCE_RAW_OFFSET_MASK -
1763 (LZMS_NUM_DELTA_REPS - 1)))
1764 continue;
1766 const u32 pair = (power << DELTA_SOURCE_POWER_SHIFT) |
1767 raw_offset;
1768 const u32 source = DELTA_SOURCE_TAG |
1769 (pair + LZMS_NUM_DELTA_REPS - 1);
1771 /* Early out for long explicit offset delta match */
1772 if (len >= c->mf.nice_match_len) {
1774 in_next = lzms_skip_bytes(c, len - 1, in_next + 1);
1776 lzms_encode_item_list(c, cur_node);
1777 lzms_encode_item(c, len, source);
1779 c->optimum_nodes[0].state = cur_node->state;
1780 cur_node = &c->optimum_nodes[0];
1782 cur_node->state.upcoming_lz_offset = 0;
1783 cur_node->state.upcoming_delta_pair = pair;
1784 lzms_update_lru_queues(&cur_node->state);
1785 lzms_update_main_state(&cur_node->state, 1);
1786 lzms_update_match_state(&cur_node->state, 1);
1787 lzms_update_delta_state(&cur_node->state, 0);
1788 goto begin;
1789 }
1791 while (end_node < cur_node + len)
1792 (++end_node)->cost = INFINITE_COST;
1794 u32 base_cost = cur_node->cost +
1795 lzms_bit_1_cost(cur_node->state.main_state,
1796 c->probs.main) +
1797 lzms_bit_1_cost(cur_node->state.match_state,
1798 c->probs.match) +
1799 lzms_bit_0_cost(cur_node->state.delta_state,
1800 c->probs.delta) +
1801 lzms_delta_source_cost(c, power, raw_offset);
1803 u32 l = NBYTES_HASHED_FOR_DELTA;
1804 do {
1805 u32 cost = base_cost + lzms_fast_length_cost(c, l);
1806 if (cost < (cur_node + l)->cost) {
1807 (cur_node + l)->cost = cost;
1808 (cur_node + l)->item = (struct lzms_item) {
1809 .length = l,
1810 .source = source,
1811 };
1812 (cur_node + l)->num_extra_items = 0;
1814 } while (++l <= len);
1819 if (end_node < cur_node + 1)
1820 (++end_node)->cost = INFINITE_COST;
1821 const u32 cur_and_lit_cost = cur_node->cost +
1822 lzms_literal_cost(c, cur_node->state.main_state,
1823 *in_next);
1824 if (cur_and_lit_cost < (cur_node + 1)->cost) {
1825 (cur_node + 1)->cost = cur_and_lit_cost;
1826 (cur_node + 1)->item = (struct lzms_item) {
1827 .length = 1,
1828 .source = *in_next,
1829 };
1830 (cur_node + 1)->num_extra_items = 0;
1831 } else if (c->try_lit_lzrep0 && in_end - (in_next + 1) >= 2) {
1832 /* try lit + LZ-rep0 */
1833 const u32 offset =
1834 (cur_node->state.prev_lz_offset) ?
1835 cur_node->state.prev_lz_offset :
1836 cur_node->state.recent_lz_offsets[0];
1838 if (load_u16_unaligned(in_next + 1) ==
1839 load_u16_unaligned(in_next + 1 - offset))
1841 const u32 rep0_len = lz_extend(in_next + 1,
1842 in_next + 1 - offset,
1843 2,
1844 min(in_end - (in_next + 1),
1845 c->mf.nice_match_len));
1847 unsigned main_state = cur_node->state.main_state;
1849 /* Update main_state after literal */
1850 main_state = (main_state << 1 | 0) % LZMS_NUM_MAIN_PROBS;
1852 /* Add cost of LZ-rep0 */
1853 const u32 cost = cur_and_lit_cost +
1854 lzms_bit_1_cost(main_state, c->probs.main) +
1855 lzms_bit_0_cost(cur_node->state.match_state,
1856 c->probs.match) +
1857 lzms_bit_1_cost(cur_node->state.lz_state,
1858 c->probs.lz) +
1859 lzms_bit_0_cost(cur_node->state.lz_rep_states[0],
1860 c->probs.lz_rep[0]) +
1861 lzms_fast_length_cost(c, rep0_len);
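				/* The literal's own cost is already included
				 * in cur_and_lit_cost.  main_state had to be
				 * advanced by hand above (shifting in a 0 bit
				 * for the literal) because node states are
				 * only finalized once the lowest-cost path is
				 * known; a literal leaves the other states
				 * unchanged. */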
				const u32 total_len = 1 + rep0_len;

				while (end_node < cur_node + total_len)
					(++end_node)->cost = INFINITE_COST;

				if (cost < (cur_node + total_len)->cost) {
					(cur_node + total_len)->cost = cost;
					(cur_node + total_len)->item = (struct lzms_item) {
						.length = rep0_len,
						.source = 0,
					};
					(cur_node + total_len)->extra_items[0] = (struct lzms_item) {
						.length = 1,
						.source = *in_next,
					};
					(cur_node + total_len)->num_extra_items = 1;
				}
			}
		}
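		/* For multi-item steps, the last item goes in 'item' and the
		 * earlier ones in 'extra_items' in reverse order; here the path
		 * is literal-then-rep0-match, so the match is the main item and
		 * the literal is extra_items[0].  The finalization code below
		 * consumes them in the same order. */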
		/* Advance to the next position. */
		in_next++;
		cur_node++;
		/* The lowest-cost path to the current position is now known.
		 * Finalize the adaptive state that results from taking this
		 * lowest-cost path. */
		struct lzms_item item_to_take = cur_node->item;
		struct lzms_optimum_node *source_node = cur_node - item_to_take.length;
		int next_item_idx = -1;
		for (unsigned i = 0; i < cur_node->num_extra_items; i++) {
			item_to_take = cur_node->extra_items[i];
			source_node -= item_to_take.length;
			next_item_idx++;
		}
		cur_node->state = source_node->state;
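		/* E.g. for the lit + LZ-rep0 case above, this walks back over
		 * the literal as well, so source_node ends up at
		 * cur_node - rep0_len - 1, the node the whole two-item step
		 * started from. */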
		for (;;) {
			const u32 length = item_to_take.length;
			u32 source = item_to_take.source;

			cur_node->state.upcoming_lz_offset = 0;
			cur_node->state.upcoming_delta_pair = 0;
			if (length > 1) {
				/* Match */

				lzms_update_main_state(&cur_node->state, 1);

				if (source & DELTA_SOURCE_TAG) {
					/* Delta match */

					lzms_update_match_state(&cur_node->state, 1);
					source &= ~DELTA_SOURCE_TAG;

					if (source >= LZMS_NUM_DELTA_REPS) {
						/* Explicit offset delta match */
						lzms_update_delta_state(&cur_node->state, 0);
						cur_node->state.upcoming_delta_pair =
							source - (LZMS_NUM_DELTA_REPS - 1);
					} else {
						/* Repeat offset delta match */
						int rep_idx = source;

						lzms_update_delta_state(&cur_node->state, 1);
						lzms_update_delta_rep_states(&cur_node->state, rep_idx);

						cur_node->state.upcoming_delta_pair =
							cur_node->state.recent_delta_pairs[rep_idx];

						for (int i = rep_idx; i < LZMS_NUM_DELTA_REPS; i++)
							cur_node->state.recent_delta_pairs[i] =
								cur_node->state.recent_delta_pairs[i + 1];
					}
				} else {
					/* LZ match */
					lzms_update_match_state(&cur_node->state, 0);

					if (source >= LZMS_NUM_LZ_REPS) {
						/* Explicit offset LZ match */
						lzms_update_lz_state(&cur_node->state, 0);
						cur_node->state.upcoming_lz_offset =
							source - (LZMS_NUM_LZ_REPS - 1);
					} else {
						/* Repeat offset LZ match */
						int rep_idx = source;

						lzms_update_lz_state(&cur_node->state, 1);
						lzms_update_lz_rep_states(&cur_node->state, rep_idx);

						cur_node->state.upcoming_lz_offset =
							cur_node->state.recent_lz_offsets[rep_idx];

						for (int i = rep_idx; i < LZMS_NUM_LZ_REPS; i++)
							cur_node->state.recent_lz_offsets[i] =
								cur_node->state.recent_lz_offsets[i + 1];
					}
				}
			} else {
				/* Literal */
				lzms_update_main_state(&cur_node->state, 0);
			}
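			/* Using a repeat offset removes it from its current
			 * queue position (the shift loops above); it re-enters
			 * at the front of the queue via the 'upcoming' field on
			 * the next call to lzms_update_lru_queues(). */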
			lzms_update_lru_queues(&cur_node->state);

			if (next_item_idx < 0)
				break;
			if (next_item_idx == 0)
				item_to_take = cur_node->item;
			else
				item_to_take = cur_node->extra_items[next_item_idx - 1];
			--next_item_idx;
		}
		/*
		 * This loop will terminate when either of the following
		 * conditions is true:
		 *
		 * (1) cur_node == end_node
		 *
		 *	There are no paths that extend beyond the current
		 *	position.  In this case, any path to a later position
		 *	must pass through the current position, so we can go
		 *	ahead and choose the list of items that led to this
		 *	position.
		 *
		 * (2) cur_node == &c->optimum_nodes[NUM_OPTIM_NODES]
		 *
		 *	This bounds the number of times the algorithm can step
		 *	forward before it is guaranteed to start choosing items.
		 *	This limits the memory usage.  It also guarantees that
		 *	the parser will not go too long without updating the
		 *	probability tables.
		 *
		 * Note: no check for end-of-buffer is needed because
		 * end-of-buffer will trigger condition (1).
		 */
		if (cur_node == end_node ||
		    cur_node == &c->optimum_nodes[NUM_OPTIM_NODES])
		{
			lzms_encode_nonempty_item_list(c, cur_node);
			c->optimum_nodes[0].state = cur_node->state;
			goto begin;
		}
	}
}
static void
lzms_init_states_and_probabilities(struct lzms_compressor *c)
{
	c->main_state = 0;
	c->match_state = 0;
	c->lz_state = 0;
	for (int i = 0; i < LZMS_NUM_LZ_REP_DECISIONS; i++)
		c->lz_rep_states[i] = 0;
	c->delta_state = 0;
	for (int i = 0; i < LZMS_NUM_DELTA_REP_DECISIONS; i++)
		c->delta_rep_states[i] = 0;

	lzms_init_probabilities(&c->probs);
}
static void
lzms_init_huffman_codes(struct lzms_compressor *c, unsigned num_offset_slots)
{
	lzms_init_huffman_code(&c->literal_rebuild_info,
			       LZMS_NUM_LITERAL_SYMS,
			       LZMS_LITERAL_CODE_REBUILD_FREQ,
			       c->literal_codewords,
			       c->literal_lens,
			       c->literal_freqs);

	lzms_init_huffman_code(&c->lz_offset_rebuild_info,
			       num_offset_slots,
			       LZMS_LZ_OFFSET_CODE_REBUILD_FREQ,
			       c->lz_offset_codewords,
			       c->lz_offset_lens,
			       c->lz_offset_freqs);

	lzms_init_huffman_code(&c->length_rebuild_info,
			       LZMS_NUM_LENGTH_SYMS,
			       LZMS_LENGTH_CODE_REBUILD_FREQ,
			       c->length_codewords,
			       c->length_lens,
			       c->length_freqs);

	lzms_init_huffman_code(&c->delta_offset_rebuild_info,
			       num_offset_slots,
			       LZMS_DELTA_OFFSET_CODE_REBUILD_FREQ,
			       c->delta_offset_codewords,
			       c->delta_offset_lens,
			       c->delta_offset_freqs);

	lzms_init_huffman_code(&c->delta_power_rebuild_info,
			       LZMS_NUM_DELTA_POWER_SYMS,
			       LZMS_DELTA_POWER_CODE_REBUILD_FREQ,
			       c->delta_power_codewords,
			       c->delta_power_lens,
			       c->delta_power_freqs);
}
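/* Note: each of these codes is adaptive.  Symbol frequencies are accumulated
 * as symbols are encoded, and every REBUILD_FREQ symbols the code is rebuilt
 * from those frequencies, mirroring what the decompressor does. */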
/*
 * Flush the output streams, prepare the final compressed data, and return its
 * size in bytes.
 *
 * A return value of 0 indicates that the data could not be compressed to fit
 * in the available space.
 */
static size_t
lzms_finalize(struct lzms_compressor *c)
{
	size_t num_forwards_units;
	size_t num_backwards_units;

	/* Flush both the forwards and backwards streams, and make sure they
	 * didn't cross each other and start overwriting each other's data. */
	if (!lzms_output_bitstream_flush(&c->os))
		return 0;

	if (!lzms_range_encoder_flush(&c->rc))
		return 0;

	if (c->rc.next > c->os.next)
		return 0;
	/* Now the compressed buffer contains the data output by the forwards
	 * bitstream, then empty space, then data output by the backwards
	 * bitstream.  Move the data output by the backwards bitstream to be
	 * adjacent to the data output by the forwards bitstream, and calculate
	 * the compressed size that this results in. */
	num_forwards_units = c->rc.next - c->rc.begin;
	num_backwards_units = c->rc.end - c->os.next;

	memmove(c->rc.next, c->os.next, num_backwards_units * sizeof(le16));

	return (num_forwards_units + num_backwards_units) * sizeof(le16);
}
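/* Worked example (illustrative numbers): with a 1000-unit output buffer, if
 * the range encoder wrote 100 units at the front and the bitstream wrote 50
 * units at the back, the memmove() above copies those 50 units to units
 * 100..149, and the compressed size is 150 * sizeof(le16) = 300 bytes. */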
static u64
lzms_get_needed_memory(size_t max_bufsize, unsigned compression_level,
		       bool destructive)
{
	u64 size = 0;

	if (max_bufsize > LZMS_MAX_BUFFER_SIZE)
		return 0;

	size += sizeof(struct lzms_compressor);

	if (!destructive)
		size += max_bufsize; /* in_buffer */

	size += lcpit_matchfinder_get_needed_memory(max_bufsize);

	return size;
}
static int
lzms_create_compressor(size_t max_bufsize, unsigned compression_level,
		       bool destructive, void **c_ret)
{
	struct lzms_compressor *c;
	u32 nice_match_len;

	if (max_bufsize > LZMS_MAX_BUFFER_SIZE)
		return WIMLIB_ERR_INVALID_PARAM;

	c = ALIGNED_MALLOC(sizeof(struct lzms_compressor), 64);
	if (!c)
		goto oom0;

	c->destructive = destructive;

	/* Scale nice_match_len with the compression level.  But to allow an
	 * optimization for length cost calculations, don't allow
	 * nice_match_len to exceed MAX_FAST_LENGTH. */
	nice_match_len = min(((u64)compression_level * 63) / 50, MAX_FAST_LENGTH);
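	/* E.g. the default level 50 gives nice_match_len = 50 * 63 / 50 = 63;
	 * level 100 gives 126; levels of 203 and above clamp to
	 * MAX_FAST_LENGTH (255). */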
	c->use_delta_matches = (compression_level >= 35);
	c->try_lzmatch_lit_lzrep0 = (compression_level >= 45);
	c->try_lit_lzrep0 = (compression_level >= 60);
	c->try_lzrep_lit_lzrep0 = (compression_level >= 60);
	if (!c->destructive) {
		c->in_buffer = MALLOC(max_bufsize);
		if (!c->in_buffer)
			goto oom1;
	}

	if (!lcpit_matchfinder_init(&c->mf, max_bufsize, 2, nice_match_len))
		goto oom2;

	lzms_init_fast_length_slot_tab(c);
	lzms_init_offset_slot_tabs(c);

	*c_ret = c;
	return 0;

oom2:
	if (!c->destructive)
		FREE(c->in_buffer);
oom1:
	ALIGNED_FREE(c);
oom0:
	return WIMLIB_ERR_NOMEM;
}
static size_t
lzms_compress(const void *restrict in, size_t in_nbytes,
	      void *restrict out, size_t out_nbytes_avail, void *restrict _c)
{
	struct lzms_compressor *c = _c;
	size_t result;

	/* Don't bother trying to compress extremely small inputs. */
	if (in_nbytes < 4)
		return 0;

	/* Copy the input data into the internal buffer and preprocess it. */
	if (c->destructive)
		c->in_buffer = (void *)in;
	else
		memcpy(c->in_buffer, in, in_nbytes);
	c->in_nbytes = in_nbytes;
	lzms_x86_filter(c->in_buffer, in_nbytes, c->last_target_usages, false);
	/* Prepare the matchfinders. */
	lcpit_matchfinder_load_buffer(&c->mf, c->in_buffer, c->in_nbytes);
	if (c->use_delta_matches)
		lzms_init_delta_matchfinder(c);

	/* Initialize the encoder structures. */
	lzms_range_encoder_init(&c->rc, out, out_nbytes_avail / sizeof(le16));
	lzms_output_bitstream_init(&c->os, out, out_nbytes_avail / sizeof(le16));
	lzms_init_states_and_probabilities(c);
	lzms_init_huffman_codes(c, lzms_get_num_offset_slots(c->in_nbytes));
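	/* Note: the range encoder and the output bitstream share 'out': the
	 * range encoder writes 16-bit units forwards from the start, while the
	 * bitstream writes backwards from the end.  lzms_finalize() joins the
	 * two regions after checking that they did not collide. */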
	/* The main loop: parse and encode. */
	lzms_near_optimal_parse(c);

	/* Return the compressed data size, or 0 if the data didn't fit.  In
	 * the latter case, if we modified the caller's buffer in place
	 * (destructive mode), undo the x86 filter so the caller gets its
	 * input back unchanged. */
	result = lzms_finalize(c);
	if (!result && c->destructive)
		lzms_x86_filter(c->in_buffer, c->in_nbytes, c->last_target_usages, true);
	return result;
}
static void
lzms_free_compressor(void *_c)
{
	struct lzms_compressor *c = _c;

	if (!c->destructive)
		FREE(c->in_buffer);
	lcpit_matchfinder_destroy(&c->mf);
	ALIGNED_FREE(c);
}
const struct compressor_ops lzms_compressor_ops = {
	.get_needed_memory = lzms_get_needed_memory,
	.create_compressor = lzms_create_compressor,
	.compress = lzms_compress,
	.free_compressor = lzms_free_compressor,
};
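/*
 * Illustrative usage sketch (not part of this file; error handling omitted):
 * the generic compressor layer drives these ops roughly as follows for a
 * 'bufsize'-byte input.  'in', 'out', and 'bufsize' are assumed names here.
 *
 *	void *c;
 *	if (lzms_compressor_ops.create_compressor(bufsize, 50, false, &c) == 0) {
 *		size_t csize = lzms_compressor_ops.compress(in, bufsize, out,
 *							    bufsize - 1, c);
 *		// csize == 0 means the data didn't fit in bufsize - 1 bytes
 *		lzms_compressor_ops.free_compressor(c);
 *	}
 */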