ef67edb213f79fa2c1bf2b1057568a28e93d7fa5
[wimlib] / src / lzms_compress.c
1 /*
2  * lzms_compress.c
3  *
4  * A compressor for the LZMS compression format.
5  */
6
7 /*
8  * Copyright (C) 2013, 2014, 2015 Eric Biggers
9  *
10  * This file is free software; you can redistribute it and/or modify it under
11  * the terms of the GNU Lesser General Public License as published by the Free
12  * Software Foundation; either version 3 of the License, or (at your option) any
13  * later version.
14  *
15  * This file is distributed in the hope that it will be useful, but WITHOUT
16  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17  * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
18  * details.
19  *
20  * You should have received a copy of the GNU Lesser General Public License
21  * along with this file; if not, see http://www.gnu.org/licenses/.
22  */
23
24 #ifdef HAVE_CONFIG_H
25 #  include "config.h"
26 #endif
27
28 #include <limits.h>
29 #include <string.h>
30
31 #include "wimlib/compress_common.h"
32 #include "wimlib/compressor_ops.h"
33 #include "wimlib/error.h"
34 #include "wimlib/lcpit_matchfinder.h"
35 #include "wimlib/lz_extend.h"
36 #include "wimlib/lz_hash.h"
37 #include "wimlib/lzms_common.h"
38 #include "wimlib/unaligned.h"
39 #include "wimlib/util.h"
40
41 /*
42  * MAX_FAST_LENGTH is the maximum match length for which the length slot can be
43  * looked up directly in 'fast_length_slot_tab' and the length cost can be
44  * looked up directly in 'fast_length_cost_tab'.
45  *
46  * We also limit the 'nice_match_len' parameter to this value.  Consequently, if
47  * the longest match found is shorter than 'nice_match_len', then it must also
48  * be shorter than MAX_FAST_LENGTH.  This makes it possible to do fast lookups
49  * of length costs using 'fast_length_cost_tab' without having to keep checking
50  * whether the length exceeds MAX_FAST_LENGTH or not.
51  */
52 #define MAX_FAST_LENGTH         255
53
54 /* NUM_OPTIM_NODES is the maximum number of bytes the parsing algorithm will
55  * step forward before forcing the pending items to be encoded.  If this value
56  * is increased, then there will be fewer forced flushes, but the probability
57  * entries and Huffman codes will be more likely to become outdated.  */
58 #define NUM_OPTIM_NODES         2048
59
60 /* COST_SHIFT is a scaling factor that makes it possible to consider fractional
61  * bit costs.  A single bit has a cost of (1 << COST_SHIFT).  */
62 #define COST_SHIFT              6
63
64 /* Length of the hash table for finding delta matches  */
65 #define DELTA_HASH_ORDER        17
66 #define DELTA_HASH_LENGTH       ((u32)1 << DELTA_HASH_ORDER)
67
68 /* The number of bytes to hash when finding delta matches; also taken to be the
69  * minimum length of an explicit offset delta match  */
70 #define NBYTES_HASHED_FOR_DELTA 3
71
72 /* The number of delta match powers to consider (must be <=
73  * LZMS_NUM_DELTA_POWER_SYMS)  */
74 #define NUM_POWERS_TO_CONSIDER  6
75
/* This structure tracks the state of writing bits as a series of 16-bit coding
 * units, starting at the end of the output buffer and proceeding backwards.
 * (The companion range encoder writes forwards from the beginning of the same
 * buffer; the two streams grow towards each other.)  */
struct lzms_output_bitstream {

	/* Bits that haven't yet been written to the output buffer  */
	u64 bitbuf;

	/* Number of bits currently held in @bitbuf  */
	unsigned bitcount;

	/* Pointer to one past the next position in the output buffer at which
	 * to output a 16-bit coding unit (units are written at --next)  */
	le16 *next;

	/* Pointer to the beginning of the output buffer (this is the "end" when
	 * writing backwards!)  */
	le16 *begin;
};
94
/* This structure tracks the state of range encoding and its output, which
 * starts at the beginning of the output buffer and proceeds forwards.  */
struct lzms_range_encoder {

	/* The lower boundary of the current range.  Logically, this is a 33-bit
	 * integer whose high bit is needed to detect carries.  */
	u64 lower_bound;

	/* The size of the current range  */
	u32 range_size;

	/* The next 16-bit coding unit to output  */
	u16 cache;

	/* The number of 16-bit coding units whose output has been delayed due
	 * to possible carrying.  The first such coding unit is @cache; all
	 * subsequent such coding units are 0xffff.  */
	u32 cache_size;

	/* Pointer to the beginning of the output buffer  */
	le16 *begin;

	/* Pointer to the position in the output buffer at which the next coding
	 * unit must be written.  Note: lzms_range_encoder_init() starts this at
	 * begin - 1, so the very first shift_low() skips one write.  */
	le16 *next;

	/* Pointer to just past the end of the output buffer  */
	le16 *end;
};
124
/* Bookkeeping information for an adaptive Huffman code.  The code is
 * periodically rebuilt from the running symbol frequencies so that it adapts
 * to the data being encoded.  */
struct lzms_huffman_rebuild_info {

	/* The remaining number of symbols to encode until this code must be
	 * rebuilt (counts down; reset to @rebuild_freq on each rebuild)  */
	unsigned num_syms_until_rebuild;

	/* The number of symbols in this code  */
	unsigned num_syms;

	/* The rebuild frequency of this code, in symbols  */
	unsigned rebuild_freq;

	/* The Huffman codeword of each symbol in this code  */
	u32 *codewords;

	/* The length of each Huffman codeword, in bits  */
	u8 *lens;

	/* The frequency of each symbol in this code  */
	u32 *freqs;
};
147
148 /*
149  * The compressor-internal representation of a match or literal.
150  *
151  * Literals have length=1; matches have length > 1.  (We disallow matches of
152  * length 1, even though this is a valid length in LZMS.)
153  *
154  * The source is encoded as follows:
155  *
156  * - Literals: the literal byte itself
157  * - Explicit offset LZ matches: the match offset plus (LZMS_NUM_LZ_REPS - 1)
158  * - Repeat offset LZ matches: the index of the offset in recent_lz_offsets
159  * - Explicit offset delta matches: DELTA_SOURCE_TAG is set, the next 3 bits are
160  *   the power, and the remainder is the raw offset plus (LZMS_NUM_DELTA_REPS-1)
161  * - Repeat offset delta matches: DELTA_SOURCE_TAG is set, and the remainder is
162  *   the index of the (power, raw_offset) pair in recent_delta_pairs
163  */
164 struct lzms_item {
165         u32 length;
166         u32 source;
167 };
168
169 #define DELTA_SOURCE_TAG                ((u32)1 << 31)
170 #define DELTA_SOURCE_POWER_SHIFT        28
171 #define DELTA_SOURCE_RAW_OFFSET_MASK    (((u32)1 << DELTA_SOURCE_POWER_SHIFT) - 1)
172
/* Compile-time check that every delta power value fits in the bits between
 * DELTA_SOURCE_POWER_SHIFT and the DELTA_SOURCE_TAG bit (bit 31).  */
static inline void
check_that_powers_fit_in_bitfield(void)
{
	STATIC_ASSERT(LZMS_NUM_DELTA_POWER_SYMS <= (1 << (31 - DELTA_SOURCE_POWER_SHIFT)));
}
178
/* A stripped-down version of the adaptive state in LZMS which excludes the
 * probability entries and Huffman codes.  This is what the parser tracks
 * per-position; the full state lives in struct lzms_compressor.  */
struct lzms_adaptive_state {

	/* Recent offsets for LZ matches  */
	u32 recent_lz_offsets[LZMS_NUM_LZ_REPS + 1];
	u32 prev_lz_offset; /* 0 means none */
	u32 upcoming_lz_offset; /* 0 means none */

	/* Recent (power, raw offset) pairs for delta matches.
	 * The low DELTA_SOURCE_POWER_SHIFT bits of each entry are the raw
	 * offset, and the high bits are the power.  */
	u32 recent_delta_pairs[LZMS_NUM_DELTA_REPS + 1];
	u32 prev_delta_pair; /* 0 means none */
	u32 upcoming_delta_pair; /* 0 means none  */

	/* States for predicting the probabilities of item types  */
	u8 main_state;
	u8 match_state;
	u8 lz_state;
	u8 lz_rep_states[LZMS_NUM_LZ_REP_DECISIONS];
	u8 delta_state;
	u8 delta_rep_states[LZMS_NUM_DELTA_REP_DECISIONS];
} _aligned_attribute(64);
203
204 /*
205  * This structure represents a byte position in the preprocessed input data and
206  * a node in the graph of possible match/literal choices.
207  *
208  * Logically, each incoming edge to this node is labeled with a literal or a
209  * match that can be taken to reach this position from an earlier position; and
210  * each outgoing edge from this node is labeled with a literal or a match that
211  * can be taken to advance from this position to a later position.
212  */
213 struct lzms_optimum_node {
214
215         /*
216          * The cost of the lowest-cost path that has been found to reach this
217          * position.  This can change as progressively lower cost paths are
218          * found to reach this position.
219          */
220         u32 cost;
221 #define INFINITE_COST UINT32_MAX
222
223         /*
224          * @item is the last item that was taken to reach this position to reach
225          * it with the stored @cost.  This can change as progressively lower
226          * cost paths are found to reach this position.
227          *
228          * In some cases we look ahead more than one item.  If we looked ahead n
229          * items to reach this position, then @item is the last item taken,
230          * @extra_items contains the other items ordered from second-to-last to
231          * first, and @num_extra_items is n - 1.
232          */
233         unsigned num_extra_items;
234         struct lzms_item item;
235         struct lzms_item extra_items[2];
236
237         /*
238          * The adaptive state that exists at this position.  This is filled in
239          * lazily, only after the minimum-cost path to this position is found.
240          *
241          * Note: the way the algorithm handles this adaptive state in the
242          * "minimum-cost" parse is actually only an approximation.  It's
243          * possible for the globally optimal, minimum cost path to contain a
244          * prefix, ending at a position, where that path prefix is *not* the
245          * minimum cost path to that position.  This can happen if such a path
246          * prefix results in a different adaptive state which results in lower
247          * costs later.  Although the algorithm does do some heuristic
248          * multi-item lookaheads, it does not solve this problem in general.
249          *
250          * Note: this adaptive state structure also does not include the
251          * probability entries or current Huffman codewords.  Those aren't
252          * maintained per-position and are only updated occassionally.
253          */
254         struct lzms_adaptive_state state;
255 } _aligned_attribute(64);
256
/* The main compressor structure  */
struct lzms_compressor {

	/* The matchfinder for LZ matches  */
	struct lcpit_matchfinder mf;

	/* The preprocessed buffer of data being compressed  */
	u8 *in_buffer;

	/* The number of bytes of data to be compressed, which is the number of
	 * bytes of data in @in_buffer that are actually valid  */
	size_t in_nbytes;

	/*
	 * Boolean flags to enable consideration of various types of multi-step
	 * operations during parsing.
	 *
	 * Among other cases, multi-step operations can help with gaps where two
	 * matches are separated by a non-matching byte.
	 *
	 * This idea is borrowed from Igor Pavlov's LZMA encoder.
	 */
	bool try_lit_lzrep0;
	bool try_lzrep_lit_lzrep0;
	bool try_lzmatch_lit_lzrep0;

	/*
	 * If true, the compressor can use delta matches.  This slows down
	 * compression.  It improves the compression ratio greatly, slightly, or
	 * not at all, depending on the input data.
	 */
	bool use_delta_matches;

	/* If true, the compressor need not preserve the input buffer if it
	 * compresses the data successfully.  */
	bool destructive;

	/* 'last_target_usages' is a large array that is only needed for
	 * preprocessing, so it is in union with fields that don't need to be
	 * initialized until after preprocessing.  (The two union members are
	 * never live at the same time; this saves a substantial amount of
	 * memory.)  */
	union {
	struct {

	/* Temporary space to store matches found by the LZ matchfinder  */
	struct lz_match matches[MAX_FAST_LENGTH - LZMS_MIN_MATCH_LENGTH + 1];

	/* Hash table for finding delta matches  */
	u32 delta_hash_table[DELTA_HASH_LENGTH];

	/* For each delta power, the hash code for the next sequence  */
	u32 next_delta_hashes[NUM_POWERS_TO_CONSIDER];

	/* The per-byte graph nodes for near-optimal parsing  */
	struct lzms_optimum_node optimum_nodes[NUM_OPTIM_NODES + MAX_FAST_LENGTH +
					       1 + MAX_FAST_LENGTH];

	/* Table: length => current cost for small match lengths  */
	u32 fast_length_cost_tab[MAX_FAST_LENGTH + 1];

	/* Range encoder which outputs to the beginning of the compressed data
	 * buffer, proceeding forwards  */
	struct lzms_range_encoder rc;

	/* Bitstream which outputs to the end of the compressed data buffer,
	 * proceeding backwards  */
	struct lzms_output_bitstream os;

	/* States and probability entries for item type disambiguation  */
	unsigned main_state;
	unsigned match_state;
	unsigned lz_state;
	unsigned lz_rep_states[LZMS_NUM_LZ_REP_DECISIONS];
	unsigned delta_state;
	unsigned delta_rep_states[LZMS_NUM_DELTA_REP_DECISIONS];
	struct lzms_probabilites probs;

	/* Huffman codes: for each code, the rebuild bookkeeping plus the
	 * codeword, codeword-length, and frequency arrays it points into  */

	struct lzms_huffman_rebuild_info literal_rebuild_info;
	u32 literal_codewords[LZMS_NUM_LITERAL_SYMS];
	u8 literal_lens[LZMS_NUM_LITERAL_SYMS];
	u32 literal_freqs[LZMS_NUM_LITERAL_SYMS];

	struct lzms_huffman_rebuild_info lz_offset_rebuild_info;
	u32 lz_offset_codewords[LZMS_MAX_NUM_OFFSET_SYMS];
	u8 lz_offset_lens[LZMS_MAX_NUM_OFFSET_SYMS];
	u32 lz_offset_freqs[LZMS_MAX_NUM_OFFSET_SYMS];

	struct lzms_huffman_rebuild_info length_rebuild_info;
	u32 length_codewords[LZMS_NUM_LENGTH_SYMS];
	u8 length_lens[LZMS_NUM_LENGTH_SYMS];
	u32 length_freqs[LZMS_NUM_LENGTH_SYMS];

	struct lzms_huffman_rebuild_info delta_offset_rebuild_info;
	u32 delta_offset_codewords[LZMS_MAX_NUM_OFFSET_SYMS];
	u8 delta_offset_lens[LZMS_MAX_NUM_OFFSET_SYMS];
	u32 delta_offset_freqs[LZMS_MAX_NUM_OFFSET_SYMS];

	struct lzms_huffman_rebuild_info delta_power_rebuild_info;
	u32 delta_power_codewords[LZMS_NUM_DELTA_POWER_SYMS];
	u8 delta_power_lens[LZMS_NUM_DELTA_POWER_SYMS];
	u32 delta_power_freqs[LZMS_NUM_DELTA_POWER_SYMS];

	}; /* struct */

	s32 last_target_usages[65536];

	}; /* union */

	/* Table: length => length slot for small match lengths  */
	u8 fast_length_slot_tab[MAX_FAST_LENGTH + 1];

	/* Tables for mapping offsets to offset slots  */

	/* slots [0, 167); 0 <= num_extra_bits <= 10 */
	u8 offset_slot_tab_1[0xe4a5];

	/* slots [167, 427); 11 <= num_extra_bits <= 15 */
	u16 offset_slot_tab_2[0x3d0000 >> 11];

	/* slots [427, 799); 16 <= num_extra_bits  */
	u16 offset_slot_tab_3[((LZMS_MAX_MATCH_OFFSET + 1) - 0xe4a5) >> 16];
};
380
381 /******************************************************************************
382  *                   Offset and length slot acceleration                      *
383  ******************************************************************************/
384
385 /* Generate the acceleration table for length slots.  */
386 static void
387 lzms_init_fast_length_slot_tab(struct lzms_compressor *c)
388 {
389         unsigned slot = 0;
390         for (u32 len = LZMS_MIN_MATCH_LENGTH; len <= MAX_FAST_LENGTH; len++) {
391                 if (len >= lzms_length_slot_base[slot + 1])
392                         slot++;
393                 c->fast_length_slot_tab[len] = slot;
394         }
395 }
396
397 /* Generate the acceleration tables for offset slots.  */
398 static void
399 lzms_init_offset_slot_tabs(struct lzms_compressor *c)
400 {
401         u32 offset;
402         unsigned slot = 0;
403
404         /* slots [0, 167); 0 <= num_extra_bits <= 10 */
405         for (offset = 1; offset < 0xe4a5; offset++) {
406                 if (offset >= lzms_offset_slot_base[slot + 1])
407                         slot++;
408                 c->offset_slot_tab_1[offset] = slot;
409         }
410
411         /* slots [167, 427); 11 <= num_extra_bits <= 15 */
412         for (; offset < 0x3de4a5; offset += (u32)1 << 11) {
413                 if (offset >= lzms_offset_slot_base[slot + 1])
414                         slot++;
415                 c->offset_slot_tab_2[(offset - 0xe4a5) >> 11] = slot;
416         }
417
418         /* slots [427, 799); 16 <= num_extra_bits  */
419         for (; offset < LZMS_MAX_MATCH_OFFSET + 1; offset += (u32)1 << 16) {
420                 if (offset >= lzms_offset_slot_base[slot + 1])
421                         slot++;
422                 c->offset_slot_tab_3[(offset - 0xe4a5) >> 16] = slot;
423         }
424 }
425
426 /*
427  * Return the length slot for the specified match length, using the compressor's
428  * acceleration table if the length is small enough.
429  */
430 static inline unsigned
431 lzms_comp_get_length_slot(const struct lzms_compressor *c, u32 length)
432 {
433         if (likely(length <= MAX_FAST_LENGTH))
434                 return c->fast_length_slot_tab[length];
435         return lzms_get_length_slot(length);
436 }
437
438 /*
439  * Return the offset slot for the specified match offset, using the compressor's
440  * acceleration tables to speed up the mapping.
441  */
442 static inline unsigned
443 lzms_comp_get_offset_slot(const struct lzms_compressor *c, u32 offset)
444 {
445         if (offset < 0xe4a5)
446                 return c->offset_slot_tab_1[offset];
447         offset -= 0xe4a5;
448         if (offset < 0x3d0000)
449                 return c->offset_slot_tab_2[offset >> 11];
450         return c->offset_slot_tab_3[offset >> 16];
451 }
452
453 /******************************************************************************
454  *                             Range encoding                                 *
455  ******************************************************************************/
456
457 /*
458  * Initialize the range encoder @rc to write forwards to the specified buffer
459  * @out that is @count 16-bit integers long.
460  */
461 static void
462 lzms_range_encoder_init(struct lzms_range_encoder *rc, le16 *out, size_t count)
463 {
464         rc->lower_bound = 0;
465         rc->range_size = 0xffffffff;
466         rc->cache = 0;
467         rc->cache_size = 1;
468         rc->begin = out;
469         rc->next = out - 1;
470         rc->end = out + count;
471 }
472
473 /*
474  * Attempt to flush bits from the range encoder.
475  *
476  * The basic idea is that we're writing bits from @rc->lower_bound to the
477  * output.  However, due to carrying, the writing of coding units with the
478  * maximum value, as well as one prior coding unit, must be delayed until it is
479  * determined whether a carry is needed.
480  *
481  * This is based on the public domain code for LZMA written by Igor Pavlov, but
482  * with the following differences:
483  *
484  *      - In LZMS, 16-bit coding units are required rather than 8-bit.
485  *
486  *      - In LZMS, the first coding unit is not ignored by the decompressor, so
487  *        the encoder cannot output a dummy value to that position.
488  */
489 static void
490 lzms_range_encoder_shift_low(struct lzms_range_encoder *rc)
491 {
492         if ((u32)(rc->lower_bound) < 0xffff0000 ||
493             (u32)(rc->lower_bound >> 32) != 0)
494         {
495                 /* Carry not needed (rc->lower_bound < 0xffff0000), or carry
496                  * occurred ((rc->lower_bound >> 32) != 0, a.k.a. the carry bit
497                  * is 1).  */
498                 do {
499                         if (likely(rc->next >= rc->begin)) {
500                                 if (rc->next != rc->end) {
501                                         put_unaligned_u16_le(rc->cache +
502                                                              (u16)(rc->lower_bound >> 32),
503                                                              rc->next++);
504                                 }
505                         } else {
506                                 rc->next++;
507                         }
508                         rc->cache = 0xffff;
509                 } while (--rc->cache_size != 0);
510
511                 rc->cache = (rc->lower_bound >> 16) & 0xffff;
512         }
513         ++rc->cache_size;
514         rc->lower_bound = (rc->lower_bound & 0xffff) << 16;
515 }
516
517 static bool
518 lzms_range_encoder_flush(struct lzms_range_encoder *rc)
519 {
520         for (int i = 0; i < 4; i++)
521                 lzms_range_encoder_shift_low(rc);
522         return rc->next != rc->end;
523 }
524
525 /*
526  * Encode the next bit using the range encoder.
527  *
528  * @prob is the probability out of LZMS_PROBABILITY_DENOMINATOR that the next
529  * bit is 0 rather than 1.
530  */
531 static inline void
532 lzms_range_encode_bit(struct lzms_range_encoder *rc, int bit, u32 prob)
533 {
534         /* Normalize if needed.  */
535         if (rc->range_size <= 0xffff) {
536                 rc->range_size <<= 16;
537                 lzms_range_encoder_shift_low(rc);
538         }
539
540         u32 bound = (rc->range_size >> LZMS_PROBABILITY_BITS) * prob;
541         if (bit == 0) {
542                 rc->range_size = bound;
543         } else {
544                 rc->lower_bound += bound;
545                 rc->range_size -= bound;
546         }
547 }
548
549 /*
550  * Encode a bit.  This wraps around lzms_range_encode_bit() to handle using and
551  * updating the state and its corresponding probability entry.
552  */
553 static inline void
554 lzms_encode_bit(int bit, unsigned *state_p, unsigned num_states,
555                 struct lzms_probability_entry *probs,
556                 struct lzms_range_encoder *rc)
557 {
558         struct lzms_probability_entry *prob_entry;
559         u32 prob;
560
561         /* Load the probability entry for the current state.  */
562         prob_entry = &probs[*state_p];
563
564         /* Update the state based on the next bit.  */
565         *state_p = ((*state_p << 1) | bit) & (num_states - 1);
566
567         /* Get the probability that the bit is 0.  */
568         prob = lzms_get_probability(prob_entry);
569
570         /* Update the probability entry.  */
571         lzms_update_probability_entry(prob_entry, bit);
572
573         /* Encode the bit using the range encoder.  */
574         lzms_range_encode_bit(rc, bit, prob);
575 }
576
577 /* Helper functions for encoding bits in the various decision classes  */
578
/* Encode a main bit: 0 = literal, 1 = match.  */
static void
lzms_encode_main_bit(struct lzms_compressor *c, int bit)
{
	lzms_encode_bit(bit, &c->main_state, LZMS_NUM_MAIN_PROBS,
			c->probs.main, &c->rc);
}
585
/* Encode a match bit: 0 = LZ match, 1 = delta match.  */
static void
lzms_encode_match_bit(struct lzms_compressor *c, int bit)
{
	lzms_encode_bit(bit, &c->match_state, LZMS_NUM_MATCH_PROBS,
			c->probs.match, &c->rc);
}
592
/* Encode a bit in the LZ match decision class.  */
static void
lzms_encode_lz_bit(struct lzms_compressor *c, int bit)
{
	lzms_encode_bit(bit, &c->lz_state, LZMS_NUM_LZ_PROBS,
			c->probs.lz, &c->rc);
}
599
/* Encode a bit in LZ repeat-offset decision class @idx.  */
static void
lzms_encode_lz_rep_bit(struct lzms_compressor *c, int bit, int idx)
{
	lzms_encode_bit(bit, &c->lz_rep_states[idx], LZMS_NUM_LZ_REP_PROBS,
			c->probs.lz_rep[idx], &c->rc);
}
606
/* Encode a bit in the delta match decision class.  */
static void
lzms_encode_delta_bit(struct lzms_compressor *c, int bit)
{
	lzms_encode_bit(bit, &c->delta_state, LZMS_NUM_DELTA_PROBS,
			c->probs.delta, &c->rc);
}
613
/* Encode a bit in delta repeat-offset decision class @idx.  */
static void
lzms_encode_delta_rep_bit(struct lzms_compressor *c, int bit, int idx)
{
	lzms_encode_bit(bit, &c->delta_rep_states[idx], LZMS_NUM_DELTA_REP_PROBS,
			c->probs.delta_rep[idx], &c->rc);
}
620
621 /******************************************************************************
622  *                   Huffman encoding and verbatim bits                       *
623  ******************************************************************************/
624
625 /*
626  * Initialize the output bitstream @os to write backwards to the specified
627  * buffer @out that is @count 16-bit integers long.
628  */
629 static void
630 lzms_output_bitstream_init(struct lzms_output_bitstream *os,
631                            le16 *out, size_t count)
632 {
633         os->bitbuf = 0;
634         os->bitcount = 0;
635         os->next = out + count;
636         os->begin = out;
637 }
638
639 /*
640  * Write some bits, contained in the low-order @num_bits bits of @bits, to the
641  * output bitstream @os.
642  *
643  * @max_num_bits is a compile-time constant that specifies the maximum number of
644  * bits that can ever be written at this call site.
645  */
646 static inline void
647 lzms_write_bits(struct lzms_output_bitstream *os, const u32 bits,
648                 const unsigned num_bits, const unsigned max_num_bits)
649 {
650         /* Add the bits to the bit buffer variable.  */
651         os->bitcount += num_bits;
652         os->bitbuf = (os->bitbuf << num_bits) | bits;
653
654         /* Check whether any coding units need to be written.  */
655         while (os->bitcount >= 16) {
656
657                 os->bitcount -= 16;
658
659                 /* Write a coding unit, unless it would underflow the buffer. */
660                 if (os->next != os->begin)
661                         put_unaligned_u16_le(os->bitbuf >> os->bitcount, --os->next);
662
663                 /* Optimization for call sites that never write more than 16
664                  * bits at once.  */
665                 if (max_num_bits <= 16)
666                         break;
667         }
668 }
669
670 /*
671  * Flush the output bitstream, ensuring that all bits written to it have been
672  * written to memory.  Return %true if all bits have been output successfully,
673  * or %false if an overrun occurred.
674  */
675 static bool
676 lzms_output_bitstream_flush(struct lzms_output_bitstream *os)
677 {
678         if (os->next == os->begin)
679                 return false;
680
681         if (os->bitcount != 0)
682                 put_unaligned_u16_le(os->bitbuf << (16 - os->bitcount), --os->next);
683
684         return true;
685 }
686
/* (Re)build the canonical Huffman code from the current symbol frequencies,
 * then reset the countdown until the next rebuild.  */
static void
lzms_build_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info)
{
	make_canonical_huffman_code(rebuild_info->num_syms,
				    LZMS_MAX_CODEWORD_LENGTH,
				    rebuild_info->freqs,
				    rebuild_info->lens,
				    rebuild_info->codewords);
	rebuild_info->num_syms_until_rebuild = rebuild_info->rebuild_freq;
}
697
698 static void
699 lzms_init_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info,
700                        unsigned num_syms, unsigned rebuild_freq,
701                        u32 *codewords, u8 *lens, u32 *freqs)
702 {
703         rebuild_info->num_syms = num_syms;
704         rebuild_info->rebuild_freq = rebuild_freq;
705         rebuild_info->codewords = codewords;
706         rebuild_info->lens = lens;
707         rebuild_info->freqs = freqs;
708         lzms_init_symbol_frequencies(freqs, num_syms);
709         lzms_build_huffman_code(rebuild_info);
710 }
711
/* Rebuild an adaptive Huffman code from its current frequencies, then dilute
 * the frequencies so older statistics decay relative to newer symbols.  */
static void
lzms_rebuild_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info)
{
	lzms_build_huffman_code(rebuild_info);
	lzms_dilute_symbol_frequencies(rebuild_info->freqs, rebuild_info->num_syms);
}
718
719 /*
720  * Encode a symbol using the specified Huffman code.  Then, if the Huffman code
721  * needs to be rebuilt, rebuild it and return true; otherwise return false.
722  */
723 static inline bool
724 lzms_huffman_encode_symbol(unsigned sym,
725                            const u32 *codewords, const u8 *lens, u32 *freqs,
726                            struct lzms_output_bitstream *os,
727                            struct lzms_huffman_rebuild_info *rebuild_info)
728 {
729         lzms_write_bits(os, codewords[sym], lens[sym], LZMS_MAX_CODEWORD_LENGTH);
730         ++freqs[sym];
731         if (--rebuild_info->num_syms_until_rebuild == 0) {
732                 lzms_rebuild_huffman_code(rebuild_info);
733                 return true;
734         }
735         return false;
736 }
737
738 /* Helper routines to encode symbols using the various Huffman codes  */
739
/* Encode a literal symbol; returns true if the literal Huffman code was just
 * rebuilt.  */
static bool
lzms_encode_literal_symbol(struct lzms_compressor *c, unsigned sym)
{
	return lzms_huffman_encode_symbol(sym, c->literal_codewords,
					  c->literal_lens, c->literal_freqs,
					  &c->os, &c->literal_rebuild_info);
}
747
/* Encode an LZ offset slot symbol; returns true if the LZ offset Huffman code
 * was just rebuilt.  */
static bool
lzms_encode_lz_offset_symbol(struct lzms_compressor *c, unsigned sym)
{
	return lzms_huffman_encode_symbol(sym, c->lz_offset_codewords,
					  c->lz_offset_lens, c->lz_offset_freqs,
					  &c->os, &c->lz_offset_rebuild_info);
}
755
/* Encode a length slot symbol; returns true if the length Huffman code was
 * just rebuilt (callers then refresh the cached length costs).  */
static bool
lzms_encode_length_symbol(struct lzms_compressor *c, unsigned sym)
{
	return lzms_huffman_encode_symbol(sym, c->length_codewords,
					  c->length_lens, c->length_freqs,
					  &c->os, &c->length_rebuild_info);
}
763
/* Encode a delta offset slot symbol; returns true if the delta offset Huffman
 * code was just rebuilt.  */
static bool
lzms_encode_delta_offset_symbol(struct lzms_compressor *c, unsigned sym)
{
	return lzms_huffman_encode_symbol(sym, c->delta_offset_codewords,
					  c->delta_offset_lens, c->delta_offset_freqs,
					  &c->os, &c->delta_offset_rebuild_info);
}
771
/* Encode a delta power symbol; returns true if the delta power Huffman code
 * was just rebuilt.  */
static bool
lzms_encode_delta_power_symbol(struct lzms_compressor *c, unsigned sym)
{
	return lzms_huffman_encode_symbol(sym, c->delta_power_codewords,
					  c->delta_power_lens, c->delta_power_freqs,
					  &c->os, &c->delta_power_rebuild_info);
}
779
780 static void
781 lzms_update_fast_length_costs(struct lzms_compressor *c);
782
783 /*
784  * Encode a match length.  If this causes the Huffman code for length symbols to
785  * be rebuilt, also update the length costs array used by the parser.
786  */
787 static void
788 lzms_encode_length(struct lzms_compressor *c, u32 length)
789 {
790         unsigned slot = lzms_comp_get_length_slot(c, length);
791
792         if (lzms_encode_length_symbol(c, slot))
793                 lzms_update_fast_length_costs(c);
794
795         lzms_write_bits(&c->os, length - lzms_length_slot_base[slot],
796                         lzms_extra_length_bits[slot],
797                         LZMS_MAX_EXTRA_LENGTH_BITS);
798 }
799
800 /* Encode the offset of an LZ match.  */
801 static void
802 lzms_encode_lz_offset(struct lzms_compressor *c, u32 offset)
803 {
804         unsigned slot = lzms_comp_get_offset_slot(c, offset);
805
806         lzms_encode_lz_offset_symbol(c, slot);
807         lzms_write_bits(&c->os, offset - lzms_offset_slot_base[slot],
808                         lzms_extra_offset_bits[slot],
809                         LZMS_MAX_EXTRA_OFFSET_BITS);
810 }
811
812 /* Encode the raw offset of a delta match.  */
813 static void
814 lzms_encode_delta_raw_offset(struct lzms_compressor *c, u32 raw_offset)
815 {
816         unsigned slot = lzms_comp_get_offset_slot(c, raw_offset);
817
818         lzms_encode_delta_offset_symbol(c, slot);
819         lzms_write_bits(&c->os, raw_offset - lzms_offset_slot_base[slot],
820                         lzms_extra_offset_bits[slot],
821                         LZMS_MAX_EXTRA_OFFSET_BITS);
822 }
823
824 /******************************************************************************
825  *                             Item encoding                                  *
826  ******************************************************************************/
827
828 /* Encode the specified item, which may be a literal or any type of match.  */
829 static void
830 lzms_encode_item(struct lzms_compressor *c, u32 length, u32 source)
831 {
832         /* Main bit: 0 = literal, 1 = match  */
833         int main_bit = (length > 1);
834         lzms_encode_main_bit(c, main_bit);
835
836         if (!main_bit) {
837                 /* Literal  */
838                 unsigned literal = source;
839                 lzms_encode_literal_symbol(c, literal);
840         } else {
841                 /* Match  */
842
843                 /* Match bit: 0 = LZ match, 1 = delta match  */
844                 int match_bit = (source & DELTA_SOURCE_TAG) != 0;
845                 lzms_encode_match_bit(c, match_bit);
846
847                 if (!match_bit) {
848                         /* LZ match  */
849
850                         /* LZ bit: 0 = explicit offset, 1 = repeat offset  */
851                         int lz_bit = (source < LZMS_NUM_LZ_REPS);
852                         lzms_encode_lz_bit(c, lz_bit);
853
854                         if (!lz_bit) {
855                                 /* Explicit offset LZ match  */
856                                 u32 offset = source - (LZMS_NUM_LZ_REPS - 1);
857                                 lzms_encode_lz_offset(c, offset);
858                         } else {
859                                 /* Repeat offset LZ match  */
860                                 int rep_idx = source;
861                                 for (int i = 0; i < rep_idx; i++)
862                                         lzms_encode_lz_rep_bit(c, 1, i);
863                                 if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS)
864                                         lzms_encode_lz_rep_bit(c, 0, rep_idx);
865                         }
866                 } else {
867                         /* Delta match  */
868
869                         source &= ~DELTA_SOURCE_TAG;
870
871                         /* Delta bit: 0 = explicit offset, 1 = repeat offset  */
872                         int delta_bit = (source < LZMS_NUM_DELTA_REPS);
873                         lzms_encode_delta_bit(c, delta_bit);
874
875                         if (!delta_bit) {
876                                 /* Explicit offset delta match  */
877                                 u32 power = source >> DELTA_SOURCE_POWER_SHIFT;
878                                 u32 raw_offset = (source & DELTA_SOURCE_RAW_OFFSET_MASK) -
879                                                  (LZMS_NUM_DELTA_REPS - 1);
880                                 lzms_encode_delta_power_symbol(c, power);
881                                 lzms_encode_delta_raw_offset(c, raw_offset);
882                         } else {
883                                 /* Repeat offset delta match  */
884                                 int rep_idx = source;
885                                 for (int i = 0; i < rep_idx; i++)
886                                         lzms_encode_delta_rep_bit(c, 1, i);
887                                 if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS)
888                                         lzms_encode_delta_rep_bit(c, 0, rep_idx);
889                         }
890                 }
891
892                 /* Match length (encoded the same way for any match type)  */
893                 lzms_encode_length(c, length);
894         }
895 }
896
/* Encode a list of matches and literals chosen by the parsing algorithm.
 * 'end_node' is the final node on the chosen path; it must not be the first
 * node (use lzms_encode_item_list() to handle that case).  */
static void
lzms_encode_nonempty_item_list(struct lzms_compressor *c,
			       struct lzms_optimum_node *end_node)
{
	/* Since we've stored at each node the item we took to arrive at that
	 * node, we can trace our chosen path in backwards order.  However, for
	 * encoding we need to trace our chosen path in forwards order.  To make
	 * this possible, the following loop moves the items from their
	 * destination nodes to their source nodes, which effectively reverses
	 * the path.  (Think of it like reversing a singly-linked list.)  */
	struct lzms_optimum_node *cur_node = end_node;
	/* Save the item stored at the node we are about to overwrite; it is
	 * needed to continue the traversal on the next iteration.  */
	struct lzms_item saved_item = cur_node->item;
	do {
		struct lzms_item item = saved_item;
		if (cur_node->num_extra_items > 0) {
			/* Handle an arrival via multi-item lookahead.  The
			 * extra items are stored on the destination node in
			 * reverse path order, so each one is written to the
			 * node at which it begins as we step backwards.  */
			unsigned i = 0;
			struct lzms_optimum_node *orig_node = cur_node;
			do {
				cur_node -= item.length;
				cur_node->item = item;
				item = orig_node->extra_items[i];
			} while (++i != orig_node->num_extra_items);
		}
		cur_node -= item.length;
		/* Preserve the source node's own arrival item before it is
		 * replaced by the item that begins here.  */
		saved_item = cur_node->item;
		cur_node->item = item;
	} while (cur_node != c->optimum_nodes);

	/* Now trace the chosen path in forwards order, encoding each item.  */
	do {
		lzms_encode_item(c, cur_node->item.length, cur_node->item.source);
		cur_node += cur_node->item.length;
	} while (cur_node != end_node);
}
933
934 static inline void
935 lzms_encode_item_list(struct lzms_compressor *c,
936                       struct lzms_optimum_node *end_node)
937 {
938         if (end_node != c->optimum_nodes)
939                 lzms_encode_nonempty_item_list(c, end_node);
940 }
941
942 /******************************************************************************
943  *                             Cost evaluation                                *
944  ******************************************************************************/
945
946 /*
947  * If p is the predicted probability of the next bit being a 0, then the number
948  * of bits required to encode a 0 bit using a binary range encoder is the real
949  * number -log2(p), and the number of bits required to encode a 1 bit is the
950  * real number -log2(1 - p).  To avoid computing either of these expressions at
951  * runtime, 'lzms_bit_costs' is a precomputed table that stores a mapping from
952  * probability to cost for each possible probability.  Specifically, the array
953  * indices are the numerators of the possible probabilities in LZMS, where the
954  * denominators are LZMS_PROBABILITY_DENOMINATOR; and the stored costs are the
955  * bit costs multiplied by 1<<COST_SHIFT and rounded to the nearest integer.
956  * Furthermore, the values stored for 0% and 100% probabilities are equal to the
957  * adjacent values, since these probabilities are not actually permitted.  This
958  * allows us to use the num_recent_zero_bits value from the
959  * lzms_probability_entry as the array index without fixing up these two special
960  * cases.
961  */
static const u32 lzms_bit_costs[LZMS_PROBABILITY_DENOMINATOR + 1] = {
	/* Entry i holds round(-log2(i/64) * 64); the entries for i = 0 and
	 * i = 64 are clamped to their neighbors since probabilities of
	 * exactly 0% and 100% cannot occur (see the comment above).  */
	384, 384, 320, 283, 256, 235, 219, 204,
	192, 181, 171, 163, 155, 147, 140, 134,
	128, 122, 117, 112, 107, 103, 99,  94,
	91,  87,  83,  80,  76,  73,  70,  67,
	64,  61,  58,  56,  53,  51,  48,  46,
	43,  41,  39,  37,  35,  33,  30,  29,
	27,  25,  23,  21,  19,  17,  16,  14,
	12,  11,  9,   8,   6,   4,   3,   1,
	1
};
973
/* Compile-time guard: the table above is only valid for COST_SHIFT == 6.  */
static inline void
check_cost_shift(void)
{
	/* lzms_bit_costs is hard-coded to the current COST_SHIFT.  If
	 * COST_SHIFT is ever changed, the table must be regenerated.  */
	STATIC_ASSERT(COST_SHIFT == 6);
}
980
#if 0
/* Reference generator for the 'lzms_bit_costs' table above.  NOTE(review):
 * kept disabled; as written it assigns into the const-qualified array, so
 * enabling it would require dropping the const qualifier and calling this at
 * startup.  */
#include <math.h>

static void
lzms_compute_bit_costs(void)
{
	for (u32 i = 0; i <= LZMS_PROBABILITY_DENOMINATOR; i++) {
		u32 prob = i;
		/* Clamp the impossible 0% and 100% probabilities to the
		 * nearest representable values before taking the log.  */
		if (prob == 0)
			prob++;
		else if (prob == LZMS_PROBABILITY_DENOMINATOR)
			prob--;

		lzms_bit_costs[i] = round(-log2((double)prob / LZMS_PROBABILITY_DENOMINATOR) *
					  (1 << COST_SHIFT));
	}
}
#endif
999
1000 /* Return the cost to encode a 0 bit in the specified context.  */
1001 static inline u32
1002 lzms_bit_0_cost(unsigned state, const struct lzms_probability_entry *probs)
1003 {
1004         return lzms_bit_costs[probs[state].num_recent_zero_bits];
1005 }
1006
1007 /* Return the cost to encode a 1 bit in the specified context.  */
1008 static inline u32
1009 lzms_bit_1_cost(unsigned state, const struct lzms_probability_entry *probs)
1010 {
1011         return lzms_bit_costs[LZMS_PROBABILITY_DENOMINATOR -
1012                               probs[state].num_recent_zero_bits];
1013 }
1014
1015 /* Return the cost to encode a literal, including the main bit.  */
1016 static inline u32
1017 lzms_literal_cost(struct lzms_compressor *c, unsigned main_state, unsigned literal)
1018 {
1019         return lzms_bit_0_cost(main_state, c->probs.main) +
1020                 ((u32)c->literal_lens[literal] << COST_SHIFT);
1021 }
1022
1023 /* Update 'fast_length_cost_tab' to use the latest Huffman code.  */
1024 static void
1025 lzms_update_fast_length_costs(struct lzms_compressor *c)
1026 {
1027         int slot = -1;
1028         u32 cost = 0;
1029         for (u32 len = LZMS_MIN_MATCH_LENGTH; len <= MAX_FAST_LENGTH; len++) {
1030                 if (len >= lzms_length_slot_base[slot + 1]) {
1031                         slot++;
1032                         cost = (u32)(c->length_lens[slot] +
1033                                      lzms_extra_length_bits[slot]) << COST_SHIFT;
1034                 }
1035                 c->fast_length_cost_tab[len] = cost;
1036         }
1037 }
1038
/* Return the cost to encode the specified match length, which must not exceed
 * MAX_FAST_LENGTH.  The table is kept in sync with the current length Huffman
 * code by lzms_update_fast_length_costs().  */
static inline u32
lzms_fast_length_cost(const struct lzms_compressor *c, u32 length)
{
	return c->fast_length_cost_tab[length];
}
1046
1047 /* Return the cost to encode the specified LZ match offset.  */
1048 static inline u32
1049 lzms_lz_offset_cost(const struct lzms_compressor *c, u32 offset)
1050 {
1051         unsigned slot = lzms_comp_get_offset_slot(c, offset);
1052         u32 num_bits = c->lz_offset_lens[slot] + lzms_extra_offset_bits[slot];
1053         return num_bits << COST_SHIFT;
1054 }
1055
1056 /* Return the cost to encode the specified delta power and raw offset.  */
1057 static inline u32
1058 lzms_delta_source_cost(const struct lzms_compressor *c, u32 power, u32 raw_offset)
1059 {
1060         unsigned slot = lzms_comp_get_offset_slot(c, raw_offset);
1061         u32 num_bits = c->delta_power_lens[power] + c->delta_offset_lens[slot] +
1062                        lzms_extra_offset_bits[slot];
1063         return num_bits << COST_SHIFT;
1064 }
1065
1066 /******************************************************************************
1067  *                              Adaptive state                                *
1068  ******************************************************************************/
1069
1070 static void
1071 lzms_init_adaptive_state(struct lzms_adaptive_state *state)
1072 {
1073         for (int i = 0; i < LZMS_NUM_LZ_REPS + 1; i++)
1074                 state->recent_lz_offsets[i] = i + 1;
1075         state->prev_lz_offset = 0;
1076         state->upcoming_lz_offset = 0;
1077
1078         for (int i = 0; i < LZMS_NUM_DELTA_REPS + 1; i++)
1079                 state->recent_delta_pairs[i] = i + 1;
1080         state->prev_delta_pair = 0;
1081         state->upcoming_delta_pair = 0;
1082
1083         state->main_state = 0;
1084         state->match_state = 0;
1085         state->lz_state = 0;
1086         for (int i = 0; i < LZMS_NUM_LZ_REP_DECISIONS; i++)
1087                 state->lz_rep_states[i] = 0;
1088         state->delta_state = 0;
1089         for (int i = 0; i < LZMS_NUM_DELTA_REP_DECISIONS; i++)
1090                 state->delta_rep_states[i] = 0;
1091 }
1092
1093 /*
1094  * Update the LRU queues for match sources when advancing by one item.
1095  *
1096  * Note: using LZMA as a point of comparison, the LRU queues in LZMS are more
1097  * complex because:
1098  *      - there are separate queues for LZ and delta matches
1099  *      - updates to the queues are delayed by one encoded item (this prevents
1100  *        sources from being bumped up to index 0 too early)
1101  */
1102 static void
1103 lzms_update_lru_queues(struct lzms_adaptive_state *state)
1104 {
1105         if (state->prev_lz_offset != 0) {
1106                 for (int i = LZMS_NUM_LZ_REPS - 1; i >= 0; i--)
1107                         state->recent_lz_offsets[i + 1] = state->recent_lz_offsets[i];
1108                 state->recent_lz_offsets[0] = state->prev_lz_offset;
1109         }
1110         state->prev_lz_offset = state->upcoming_lz_offset;
1111
1112         if (state->prev_delta_pair != 0) {
1113                 for (int i = LZMS_NUM_DELTA_REPS - 1; i >= 0; i--)
1114                         state->recent_delta_pairs[i + 1] = state->recent_delta_pairs[i];
1115                 state->recent_delta_pairs[0] = state->prev_delta_pair;
1116         }
1117         state->prev_delta_pair = state->upcoming_delta_pair;
1118 }
1119
/* Advance an adaptive bit-modeling state by one coded bit: shift in the new
 * bit and mask off the oldest.  'num_states' must be a power of two for the
 * mask to be correct.  */
static inline void
lzms_update_state(u8 *state_p, int bit, unsigned num_states)
{
	*state_p = ((*state_p << 1) | bit) & (num_states - 1);
}
1125
/* Record the main decision (0 = literal, 1 = match) in the adaptive state.  */
static inline void
lzms_update_main_state(struct lzms_adaptive_state *state, int is_match)
{
	lzms_update_state(&state->main_state, is_match, LZMS_NUM_MAIN_PROBS);
}
1131
/* Record the match-type decision (0 = LZ, 1 = delta) in the adaptive state.  */
static inline void
lzms_update_match_state(struct lzms_adaptive_state *state, int is_delta)
{
	lzms_update_state(&state->match_state, is_delta, LZMS_NUM_MATCH_PROBS);
}
1137
/* Record the LZ offset-type decision (0 = explicit, 1 = repeat) in the
 * adaptive state.  */
static inline void
lzms_update_lz_state(struct lzms_adaptive_state *state, int is_rep)
{
	lzms_update_state(&state->lz_state, is_rep, LZMS_NUM_LZ_PROBS);
}
1143
1144 static inline void
1145 lzms_update_lz_rep_states(struct lzms_adaptive_state *state, int rep_idx)
1146 {
1147         for (int i = 0; i < rep_idx; i++)
1148                 lzms_update_state(&state->lz_rep_states[i], 1, LZMS_NUM_LZ_REP_PROBS);
1149
1150         if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS)
1151                 lzms_update_state(&state->lz_rep_states[rep_idx], 0, LZMS_NUM_LZ_REP_PROBS);
1152 }
1153
/* Record the delta offset-type decision (0 = explicit, 1 = repeat) in the
 * adaptive state.  */
static inline void
lzms_update_delta_state(struct lzms_adaptive_state *state, int is_rep)
{
	lzms_update_state(&state->delta_state, is_rep, LZMS_NUM_DELTA_PROBS);
}
1159
1160 static inline void
1161 lzms_update_delta_rep_states(struct lzms_adaptive_state *state, int rep_idx)
1162 {
1163         for (int i = 0; i < rep_idx; i++)
1164                 lzms_update_state(&state->delta_rep_states[i], 1, LZMS_NUM_DELTA_REP_PROBS);
1165
1166         if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS)
1167                 lzms_update_state(&state->delta_rep_states[rep_idx], 0, LZMS_NUM_DELTA_REP_PROBS);
1168 }
1169
1170 /******************************************************************************
1171  *                              Matchfinding                                  *
1172  ******************************************************************************/
1173
1174 /* Note: this code just handles finding delta matches.  The code for finding LZ
1175  * matches is elsewhere.  */
1176
1177
1178 /* Initialize the delta matchfinder for a new input buffer.  */
1179 static void
1180 lzms_init_delta_matchfinder(struct lzms_compressor *c)
1181 {
1182         /* Set all entries to use an invalid power, which will never match.  */
1183         STATIC_ASSERT(NUM_POWERS_TO_CONSIDER < (1 << (32 - DELTA_SOURCE_POWER_SHIFT)));
1184         memset(c->delta_hash_table, 0xFF, sizeof(c->delta_hash_table));
1185
1186         /* Initialize the next hash code for each power.  We can just use zeroes
1187          * initially; it doesn't really matter.  */
1188         for (u32 i = 0; i < NUM_POWERS_TO_CONSIDER; i++)
1189                 c->next_delta_hashes[i] = 0;
1190 }
1191
1192 /*
1193  * Compute a DELTA_HASH_ORDER-bit hash code for the first
1194  * NBYTES_HASHED_FOR_DELTA bytes of the sequence beginning at @p when taken in a
1195  * delta context with the specified @span.
1196  */
1197 static inline u32
1198 lzms_delta_hash(const u8 *p, const u32 pos, u32 span)
1199 {
1200         /* A delta match has a certain span and an offset that is a multiple of
1201          * that span.  To reduce wasted space we use a single combined hash
1202          * table for all spans and positions, but to minimize collisions we
1203          * include in the hash code computation the span and the low-order bits
1204          * of the current position.  */
1205
1206         STATIC_ASSERT(NBYTES_HASHED_FOR_DELTA == 3);
1207         u8 d0 = *(p + 0) - *(p + 0 - span);
1208         u8 d1 = *(p + 1) - *(p + 1 - span);
1209         u8 d2 = *(p + 2) - *(p + 2 - span);
1210         u32 v = ((span + (pos & (span - 1))) << 24) |
1211                 ((u32)d2 << 16) | ((u32)d1 << 8) | d0;
1212         return lz_hash(v, DELTA_HASH_ORDER);
1213 }
1214
1215 /*
1216  * Given a match between @in_next and @matchptr in a delta context with the
1217  * specified @span and having the initial @len, extend the match as far as
1218  * possible, up to a limit of @max_len.
1219  */
1220 static inline u32
1221 lzms_extend_delta_match(const u8 *in_next, const u8 *matchptr,
1222                         u32 len, u32 max_len, u32 span)
1223 {
1224         while (len < max_len &&
1225                (u8)(*(in_next + len) - *(in_next + len - span)) ==
1226                (u8)(*(matchptr + len) - *(matchptr + len - span)))
1227         {
1228                 len++;
1229         }
1230         return len;
1231 }
1232
1233 static void
1234 lzms_delta_matchfinder_skip_bytes(struct lzms_compressor *c,
1235                                   const u8 *in_next, u32 count)
1236 {
1237         u32 pos = in_next - c->in_buffer;
1238         if (unlikely(c->in_nbytes - (pos + count) <= NBYTES_HASHED_FOR_DELTA + 1))
1239                 return;
1240         do {
1241                 /* Update the hash table for each power.  */
1242                 for (u32 power = 0; power < NUM_POWERS_TO_CONSIDER; power++) {
1243                         const u32 span = (u32)1 << power;
1244                         if (unlikely(pos < span))
1245                                 continue;
1246                         const u32 next_hash = lzms_delta_hash(in_next + 1, pos + 1, span);
1247                         const u32 hash = c->next_delta_hashes[power];
1248                         c->delta_hash_table[hash] =
1249                                 (power << DELTA_SOURCE_POWER_SHIFT) | pos;
1250                         c->next_delta_hashes[power] = next_hash;
1251                         prefetchw(&c->delta_hash_table[next_hash]);
1252                 }
1253         } while (in_next++, pos++, --count);
1254 }
1255
1256 /*
1257  * Skip the next @count bytes (don't search for matches at them).  @in_next
1258  * points to the first byte to skip.  The return value is @in_next + count.
1259  */
1260 static const u8 *
1261 lzms_skip_bytes(struct lzms_compressor *c, u32 count, const u8 *in_next)
1262 {
1263         lcpit_matchfinder_skip_bytes(&c->mf, count);
1264         if (c->use_delta_matches)
1265                 lzms_delta_matchfinder_skip_bytes(c, in_next, count);
1266         return in_next + count;
1267 }
1268
1269 /******************************************************************************
1270  *                          "Near-optimal" parsing                            *
1271  ******************************************************************************/
1272
1273 /*
1274  * The main near-optimal parsing routine.
1275  *
1276  * Briefly, the algorithm does an approximate minimum-cost path search to find a
1277  * "near-optimal" sequence of matches and literals to output, based on the
1278  * current cost model.  The algorithm steps forward, position by position (byte
1279  * by byte), and updates the minimum cost path to reach each later position that
1280  * can be reached using a match or literal from the current position.  This is
1281  * essentially Dijkstra's algorithm in disguise: the graph nodes are positions,
1282  * the graph edges are possible matches/literals to code, and the cost of each
1283  * edge is the estimated number of bits (scaled up by COST_SHIFT) that will be
1284  * required to output the corresponding match or literal.  But one difference is
1285  * that we actually compute the lowest-cost path in pieces, where each piece is
1286  * terminated when there are no choices to be made.
1287  *
1288  * The costs of literals and matches are estimated using the range encoder
1289  * states and the semi-adaptive Huffman codes.  Except for range encoding
1290  * states, costs are assumed to be constant throughout a single run of the
1291  * parsing algorithm, which can parse up to NUM_OPTIM_NODES bytes of data.  This
1292  * introduces a source of non-optimality because the probabilities and Huffman
1293  * codes can change over this part of the data.  And of course, there are
1294  * various other reasons why the result isn't optimal in terms of compression
1295  * ratio.
1296  */
1297 static void
1298 lzms_near_optimal_parse(struct lzms_compressor *c)
1299 {
1300         const u8 *in_next = c->in_buffer;
1301         const u8 * const in_end = &c->in_buffer[c->in_nbytes];
1302         struct lzms_optimum_node *cur_node;
1303         struct lzms_optimum_node *end_node;
1304
1305         /* Set initial length costs for lengths <= MAX_FAST_LENGTH.  */
1306         lzms_update_fast_length_costs(c);
1307
1308         /* Set up the initial adaptive state.  */
1309         lzms_init_adaptive_state(&c->optimum_nodes[0].state);
1310
1311 begin:
1312         /* Start building a new list of items, which will correspond to the next
1313          * piece of the overall minimum-cost path.  */
1314
1315         cur_node = c->optimum_nodes;
1316         cur_node->cost = 0;
1317         end_node = cur_node;
1318
1319         if (in_next == in_end)
1320                 return;
1321
	/* The following loop runs once for each byte in the input buffer,
	 * except in a few shortcut cases.  */
1324         for (;;) {
1325                 u32 num_matches;
1326
1327                 /* Repeat offset LZ matches  */
1328                 if (likely(in_next - c->in_buffer >= LZMS_NUM_LZ_REPS &&
1329                            in_end - in_next >= 2))
1330                 {
1331                         for (int rep_idx = 0; rep_idx < LZMS_NUM_LZ_REPS; rep_idx++) {
1332
1333                                 /* Looking for a repeat offset LZ match at queue
1334                                  * index @rep_idx  */
1335
1336                                 const u32 offset = cur_node->state.recent_lz_offsets[rep_idx];
1337                                 const u8 * const matchptr = in_next - offset;
1338
1339                                 /* Check the first 2 bytes before entering the extension loop.  */
1340                                 if (load_u16_unaligned(in_next) != load_u16_unaligned(matchptr))
1341                                         continue;
1342
1343                                 /* Extend the match to its full length.  */
1344                                 const u32 rep_len = lz_extend(in_next, matchptr, 2, in_end - in_next);
1345
1346                                 /* Early out for long repeat offset LZ match */
1347                                 if (rep_len >= c->mf.nice_match_len) {
1348
1349                                         in_next = lzms_skip_bytes(c, rep_len, in_next);
1350
1351                                         lzms_encode_item_list(c, cur_node);
1352                                         lzms_encode_item(c, rep_len, rep_idx);
1353
1354                                         c->optimum_nodes[0].state = cur_node->state;
1355                                         cur_node = &c->optimum_nodes[0];
1356
1357                                         cur_node->state.upcoming_lz_offset =
1358                                                 cur_node->state.recent_lz_offsets[rep_idx];
1359                                         cur_node->state.upcoming_delta_pair = 0;
1360                                         for (int i = rep_idx; i < LZMS_NUM_LZ_REPS; i++)
1361                                                 cur_node->state.recent_lz_offsets[i] =
1362                                                         cur_node->state.recent_lz_offsets[i + 1];
1363                                         lzms_update_lru_queues(&cur_node->state);
1364                                         lzms_update_main_state(&cur_node->state, 1);
1365                                         lzms_update_match_state(&cur_node->state, 0);
1366                                         lzms_update_lz_state(&cur_node->state, 1);
1367                                         lzms_update_lz_rep_states(&cur_node->state, rep_idx);
1368                                         goto begin;
1369                                 }
1370
1371                                 while (end_node < cur_node + rep_len)
1372                                         (++end_node)->cost = INFINITE_COST;
1373
1374                                 u32 base_cost = cur_node->cost +
1375                                                 lzms_bit_1_cost(cur_node->state.main_state,
1376                                                                 c->probs.main) +
1377                                                 lzms_bit_0_cost(cur_node->state.match_state,
1378                                                                 c->probs.match) +
1379                                                 lzms_bit_1_cost(cur_node->state.lz_state,
1380                                                                 c->probs.lz);
1381
1382                                 for (int i = 0; i < rep_idx; i++)
1383                                         base_cost += lzms_bit_1_cost(cur_node->state.lz_rep_states[i],
1384                                                                      c->probs.lz_rep[i]);
1385
1386                                 if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS)
1387                                         base_cost += lzms_bit_0_cost(cur_node->state.lz_rep_states[rep_idx],
1388                                                                      c->probs.lz_rep[rep_idx]);
1389
1390                                 u32 len = 2;
1391                                 do {
1392                                         u32 cost = base_cost + lzms_fast_length_cost(c, len);
1393                                         if (cost < (cur_node + len)->cost) {
1394                                                 (cur_node + len)->cost = cost;
1395                                                 (cur_node + len)->item = (struct lzms_item) {
1396                                                         .length = len,
1397                                                         .source = rep_idx,
1398                                                 };
1399                                                 (cur_node + len)->num_extra_items = 0;
1400                                         }
1401                                 } while (++len <= rep_len);
1402
1403
1404                                 /* try LZ-rep + lit + LZ-rep0  */
1405                                 if (c->try_lzrep_lit_lzrep0 &&
1406                                     in_end - (in_next + rep_len) >= 3 &&
1407                                     load_u16_unaligned(in_next + rep_len + 1) ==
1408                                     load_u16_unaligned(matchptr + rep_len + 1))
1409                                 {
1410                                         const u32 rep0_len = lz_extend(in_next + rep_len + 1,
1411                                                                        matchptr + rep_len + 1,
1412                                                                        2,
1413                                                                        min(c->mf.nice_match_len,
1414                                                                            in_end - (in_next + rep_len + 1)));
1415
1416                                         unsigned main_state = cur_node->state.main_state;
1417                                         unsigned match_state = cur_node->state.match_state;
1418                                         unsigned lz_state = cur_node->state.lz_state;
1419                                         unsigned lz_rep0_state = cur_node->state.lz_rep_states[0];
1420
1421                                         /* update states for LZ-rep  */
1422                                         main_state = ((main_state << 1) | 1) % LZMS_NUM_MAIN_PROBS;
1423                                         match_state = ((match_state << 1) | 0) % LZMS_NUM_MATCH_PROBS;
1424                                         lz_state = ((lz_state << 1) | 1) % LZMS_NUM_LZ_PROBS;
1425                                         lz_rep0_state = ((lz_rep0_state << 1) | (rep_idx > 0)) %
1426                                                                 LZMS_NUM_LZ_REP_PROBS;
1427
1428                                         /* LZ-rep cost  */
1429                                         u32 cost = base_cost + lzms_fast_length_cost(c, rep_len);
1430
1431                                         /* add literal cost  */
1432                                         cost += lzms_literal_cost(c, main_state, *(in_next + rep_len));
1433
1434                                         /* update state for literal  */
1435                                         main_state = ((main_state << 1) | 0) % LZMS_NUM_MAIN_PROBS;
1436
1437                                         /* add LZ-rep0 cost  */
1438                                         cost += lzms_bit_1_cost(main_state, c->probs.main) +
1439                                                 lzms_bit_0_cost(match_state, c->probs.match) +
1440                                                 lzms_bit_1_cost(lz_state, c->probs.lz) +
1441                                                 lzms_bit_0_cost(lz_rep0_state, c->probs.lz_rep[0]) +
1442                                                 lzms_fast_length_cost(c, rep0_len);
1443
1444                                         const u32 total_len = rep_len + 1 + rep0_len;
1445
1446                                         while (end_node < cur_node + total_len)
1447                                                 (++end_node)->cost = INFINITE_COST;
1448
1449                                         if (cost < (cur_node + total_len)->cost) {
1450                                                 (cur_node + total_len)->cost = cost;
1451                                                 (cur_node + total_len)->item = (struct lzms_item) {
1452                                                         .length = rep0_len,
1453                                                         .source = 0,
1454                                                 };
1455                                                 (cur_node + total_len)->extra_items[0] = (struct lzms_item) {
1456                                                         .length = 1,
1457                                                         .source = *(in_next + rep_len),
1458                                                 };
1459                                                 (cur_node + total_len)->extra_items[1] = (struct lzms_item) {
1460                                                         .length = rep_len,
1461                                                         .source = rep_idx,
1462                                                 };
1463                                                 (cur_node + total_len)->num_extra_items = 2;
1464                                         }
1465                                 }
1466                         }
1467                 }
1468
1469                 /* Repeat offset delta matches  */
1470                 if (c->use_delta_matches &&
1471                     likely(in_next - c->in_buffer >= LZMS_NUM_DELTA_REPS + 1 &&
1472                            (in_end - in_next >= 2)))
1473                 {
1474                         for (int rep_idx = 0; rep_idx < LZMS_NUM_DELTA_REPS; rep_idx++) {
1475
1476                                 /* Looking for a repeat offset delta match at
1477                                  * queue index @rep_idx  */
1478
1479                                 const u32 pair = cur_node->state.recent_delta_pairs[rep_idx];
1480                                 const u32 power = pair >> DELTA_SOURCE_POWER_SHIFT;
1481                                 const u32 raw_offset = pair & DELTA_SOURCE_RAW_OFFSET_MASK;
1482                                 const u32 span = (u32)1 << power;
1483                                 const u32 offset = raw_offset << power;
1484                                 const u8 * const matchptr = in_next - offset;
1485
1486                                 /* Check the first 2 bytes before entering the
1487                                  * extension loop.  */
1488                                 if (((u8)(*(in_next + 0) - *(in_next + 0 - span)) !=
1489                                      (u8)(*(matchptr + 0) - *(matchptr + 0 - span))) ||
1490                                     ((u8)(*(in_next + 1) - *(in_next + 1 - span)) !=
1491                                      (u8)(*(matchptr + 1) - *(matchptr + 1 - span))))
1492                                         continue;
1493
1494                                 /* Extend the match to its full length.  */
1495                                 const u32 rep_len = lzms_extend_delta_match(in_next, matchptr,
1496                                                                             2, in_end - in_next,
1497                                                                             span);
1498
1499                                 /* Early out for long repeat offset delta match */
1500                                 if (rep_len >= c->mf.nice_match_len) {
1501
1502                                         in_next = lzms_skip_bytes(c, rep_len, in_next);
1503
1504                                         lzms_encode_item_list(c, cur_node);
1505                                         lzms_encode_item(c, rep_len, DELTA_SOURCE_TAG | rep_idx);
1506
1507                                         c->optimum_nodes[0].state = cur_node->state;
1508                                         cur_node = &c->optimum_nodes[0];
1509
1510                                         cur_node->state.upcoming_delta_pair = pair;
1511                                         cur_node->state.upcoming_lz_offset = 0;
1512                                         for (int i = rep_idx; i < LZMS_NUM_DELTA_REPS; i++)
1513                                                 cur_node->state.recent_delta_pairs[i] =
1514                                                         cur_node->state.recent_delta_pairs[i + 1];
1515                                         lzms_update_lru_queues(&cur_node->state);
1516                                         lzms_update_main_state(&cur_node->state, 1);
1517                                         lzms_update_match_state(&cur_node->state, 1);
1518                                         lzms_update_delta_state(&cur_node->state, 1);
1519                                         lzms_update_delta_rep_states(&cur_node->state, rep_idx);
1520                                         goto begin;
1521                                 }
1522
1523                                 while (end_node < cur_node + rep_len)
1524                                         (++end_node)->cost = INFINITE_COST;
1525
1526                                 u32 base_cost = cur_node->cost +
1527                                                 lzms_bit_1_cost(cur_node->state.main_state,
1528                                                                 c->probs.main) +
1529                                                 lzms_bit_1_cost(cur_node->state.match_state,
1530                                                                 c->probs.match) +
1531                                                 lzms_bit_1_cost(cur_node->state.delta_state,
1532                                                                 c->probs.delta);
1533
1534                                 for (int i = 0; i < rep_idx; i++)
1535                                         base_cost += lzms_bit_1_cost(cur_node->state.delta_rep_states[i],
1536                                                                      c->probs.delta_rep[i]);
1537
1538                                 if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS)
1539                                         base_cost += lzms_bit_0_cost(cur_node->state.delta_rep_states[rep_idx],
1540                                                                      c->probs.delta_rep[rep_idx]);
1541
1542                                 u32 len = 2;
1543                                 do {
1544                                         u32 cost = base_cost + lzms_fast_length_cost(c, len);
1545                                         if (cost < (cur_node + len)->cost) {
1546                                                 (cur_node + len)->cost = cost;
1547                                                 (cur_node + len)->item = (struct lzms_item) {
1548                                                         .length = len,
1549                                                         .source = DELTA_SOURCE_TAG | rep_idx,
1550                                                 };
1551                                                 (cur_node + len)->num_extra_items = 0;
1552                                         }
1553                                 } while (++len <= rep_len);
1554                         }
1555                 }
1556
1557                 /* Explicit offset LZ matches  */
1558                 num_matches = lcpit_matchfinder_get_matches(&c->mf, c->matches);
1559                 if (num_matches) {
1560
1561                         u32 best_len = c->matches[0].length;
1562
1563                         /* Early out for long explicit offset LZ match  */
1564                         if (best_len >= c->mf.nice_match_len) {
1565
1566                                 const u32 offset = c->matches[0].offset;
1567
1568                                 /* Extend the match as far as possible.
1569                                  * This is necessary because the LCP-interval
1570                                  * tree matchfinder only reports up to
1571                                  * nice_match_len bytes.  */
1572                                 best_len = lz_extend(in_next, in_next - offset,
1573                                                      best_len, in_end - in_next);
1574
1575                                 in_next = lzms_skip_bytes(c, best_len - 1, in_next + 1);
1576
1577                                 lzms_encode_item_list(c, cur_node);
1578                                 lzms_encode_item(c, best_len, offset + LZMS_NUM_LZ_REPS - 1);
1579
1580                                 c->optimum_nodes[0].state = cur_node->state;
1581                                 cur_node = &c->optimum_nodes[0];
1582
1583                                 cur_node->state.upcoming_lz_offset = offset;
1584                                 cur_node->state.upcoming_delta_pair = 0;
1585                                 lzms_update_lru_queues(&cur_node->state);
1586                                 lzms_update_main_state(&cur_node->state, 1);
1587                                 lzms_update_match_state(&cur_node->state, 0);
1588                                 lzms_update_lz_state(&cur_node->state, 0);
1589                                 goto begin;
1590                         }
1591
1592                         while (end_node < cur_node + best_len)
1593                                 (++end_node)->cost = INFINITE_COST;
1594
1595                         u32 base_cost = cur_node->cost +
1596                                         lzms_bit_1_cost(cur_node->state.main_state,
1597                                                         c->probs.main) +
1598                                         lzms_bit_0_cost(cur_node->state.match_state,
1599                                                         c->probs.match) +
1600                                         lzms_bit_0_cost(cur_node->state.lz_state,
1601                                                         c->probs.lz);
1602
1603                         if (c->try_lzmatch_lit_lzrep0 &&
1604                             likely(in_end - (in_next + c->matches[0].length) >= 3))
1605                         {
1606                                 /* try LZ-match + lit + LZ-rep0  */
1607
1608                                 u32 l = 2;
1609                                 u32 i = num_matches - 1;
1610                                 do {
1611                                         const u32 len = c->matches[i].length;
1612                                         const u32 offset = c->matches[i].offset;
1613                                         const u32 position_cost = base_cost +
1614                                                                   lzms_lz_offset_cost(c, offset);
1615                                         do {
1616                                                 u32 cost = position_cost + lzms_fast_length_cost(c, l);
1617                                                 if (cost < (cur_node + l)->cost) {
1618                                                         (cur_node + l)->cost = cost;
1619                                                         (cur_node + l)->item = (struct lzms_item) {
1620                                                                 .length = l,
1621                                                                 .source = offset + (LZMS_NUM_LZ_REPS - 1),
1622                                                         };
1623                                                         (cur_node + l)->num_extra_items = 0;
1624                                                 }
1625                                         } while (++l <= len);
1626
1627                                         const u8 * const matchptr = in_next - offset;
1628                                         if (load_u16_unaligned(matchptr + len + 1) !=
1629                                             load_u16_unaligned(in_next + len + 1))
1630                                                 continue;
1631
1632                                         const u32 rep0_len = lz_extend(in_next + len + 1,
1633                                                                        matchptr + len + 1,
1634                                                                        2,
1635                                                                        min(c->mf.nice_match_len,
1636                                                                            in_end - (in_next + len + 1)));
1637
1638                                         unsigned main_state = cur_node->state.main_state;
1639                                         unsigned match_state = cur_node->state.match_state;
1640                                         unsigned lz_state = cur_node->state.lz_state;
1641
1642                                         /* update states for LZ-match  */
1643                                         main_state = ((main_state << 1) | 1) % LZMS_NUM_MAIN_PROBS;
1644                                         match_state = ((match_state << 1) | 0) % LZMS_NUM_MATCH_PROBS;
1645                                         lz_state = ((lz_state << 1) | 0) % LZMS_NUM_LZ_PROBS;
1646
1647                                         /* LZ-match cost  */
1648                                         u32 cost = position_cost + lzms_fast_length_cost(c, len);
1649
1650                                         /* add literal cost  */
1651                                         cost += lzms_literal_cost(c, main_state, *(in_next + len));
1652
1653                                         /* update state for literal  */
1654                                         main_state = ((main_state << 1) | 0) % LZMS_NUM_MAIN_PROBS;
1655
1656                                         /* add LZ-rep0 cost  */
1657                                         cost += lzms_bit_1_cost(main_state, c->probs.main) +
1658                                                 lzms_bit_0_cost(match_state, c->probs.match) +
1659                                                 lzms_bit_1_cost(lz_state, c->probs.lz) +
1660                                                 lzms_bit_0_cost(cur_node->state.lz_rep_states[0],
1661                                                                 c->probs.lz_rep[0]) +
1662                                                 lzms_fast_length_cost(c, rep0_len);
1663
1664                                         const u32 total_len = len + 1 + rep0_len;
1665
1666                                         while (end_node < cur_node + total_len)
1667                                                 (++end_node)->cost = INFINITE_COST;
1668
1669                                         if (cost < (cur_node + total_len)->cost) {
1670                                                 (cur_node + total_len)->cost = cost;
1671                                                 (cur_node + total_len)->item = (struct lzms_item) {
1672                                                         .length = rep0_len,
1673                                                         .source = 0,
1674                                                 };
1675                                                 (cur_node + total_len)->extra_items[0] = (struct lzms_item) {
1676                                                         .length = 1,
1677                                                         .source = *(in_next + len),
1678                                                 };
1679                                                 (cur_node + total_len)->extra_items[1] = (struct lzms_item) {
1680                                                         .length = len,
1681                                                         .source = offset + LZMS_NUM_LZ_REPS - 1,
1682                                                 };
1683                                                 (cur_node + total_len)->num_extra_items = 2;
1684                                         }
1685                                 } while (i--);
1686                         } else {
1687                                 u32 l = 2;
1688                                 u32 i = num_matches - 1;
1689                                 do {
1690                                         u32 position_cost = base_cost +
1691                                                             lzms_lz_offset_cost(c, c->matches[i].offset);
1692                                         do {
1693                                                 u32 cost = position_cost + lzms_fast_length_cost(c, l);
1694                                                 if (cost < (cur_node + l)->cost) {
1695                                                         (cur_node + l)->cost = cost;
1696                                                         (cur_node + l)->item = (struct lzms_item) {
1697                                                                 .length = l,
1698                                                                 .source = c->matches[i].offset +
1699                                                                           (LZMS_NUM_LZ_REPS - 1),
1700                                                         };
1701                                                         (cur_node + l)->num_extra_items = 0;
1702                                                 }
1703                                         } while (++l <= c->matches[i].length);
1704                                 } while (i--);
1705                         }
1706                 }
1707
1708                 /* Explicit offset delta matches  */
1709                 if (c->use_delta_matches &&
1710                     likely(in_end - in_next >= NBYTES_HASHED_FOR_DELTA + 1))
1711                 {
1712                         const u32 pos = in_next - c->in_buffer;
1713
1714                         /* Consider each possible power (log2 of span)  */
1715                         STATIC_ASSERT(NUM_POWERS_TO_CONSIDER <= LZMS_NUM_DELTA_POWER_SYMS);
1716                         for (u32 power = 0; power < NUM_POWERS_TO_CONSIDER; power++) {
1717
1718                                 const u32 span = (u32)1 << power;
1719
1720                                 if (unlikely(pos < span))
1721                                         continue;
1722
1723                                 const u32 next_hash = lzms_delta_hash(in_next + 1, pos + 1, span);
1724                                 const u32 hash = c->next_delta_hashes[power];
1725                                 const u32 cur_match = c->delta_hash_table[hash];
1726
1727                                 c->delta_hash_table[hash] = (power << DELTA_SOURCE_POWER_SHIFT) | pos;
1728                                 c->next_delta_hashes[power] = next_hash;
1729                                 prefetchw(&c->delta_hash_table[next_hash]);
1730
1731                                 if (power != cur_match >> DELTA_SOURCE_POWER_SHIFT)
1732                                         continue;
1733
1734                                 const u32 offset = pos - (cur_match & DELTA_SOURCE_RAW_OFFSET_MASK);
1735
1736                                 /* The offset must be a multiple of span.  */
1737                                 if (offset & (span - 1))
1738                                         continue;
1739
1740                                 const u8 * const matchptr = in_next - offset;
1741
1742                                 /* Check the first 3 bytes before entering the
1743                                  * extension loop.  */
1744                                 STATIC_ASSERT(NBYTES_HASHED_FOR_DELTA == 3);
1745                                 if (((u8)(*(in_next + 0) - *(in_next + 0 - span)) !=
1746                                      (u8)(*(matchptr + 0) - *(matchptr + 0 - span))) ||
1747                                     ((u8)(*(in_next + 1) - *(in_next + 1 - span)) !=
1748                                      (u8)(*(matchptr + 1) - *(matchptr + 1 - span))) ||
1749                                     ((u8)(*(in_next + 2) - *(in_next + 2 - span)) !=
1750                                      (u8)(*(matchptr + 2) - *(matchptr + 2 - span))))
1751                                         continue;
1752
1753                                 /* Extend the delta match to its full length.  */
1754                                 const u32 len = lzms_extend_delta_match(in_next,
1755                                                                         matchptr,
1756                                                                         NBYTES_HASHED_FOR_DELTA,
1757                                                                         in_end - in_next,
1758                                                                         span);
1759
1760                                 const u32 raw_offset = offset >> power;
1761
1762                                 if (unlikely(raw_offset > DELTA_SOURCE_RAW_OFFSET_MASK -
1763                                                           (LZMS_NUM_DELTA_REPS - 1)))
1764                                         continue;
1765
1766                                 const u32 pair = (power << DELTA_SOURCE_POWER_SHIFT) |
1767                                                  raw_offset;
1768                                 const u32 source = DELTA_SOURCE_TAG |
1769                                                    (pair + LZMS_NUM_DELTA_REPS - 1);
1770
1771                                 /* Early out for long explicit offset delta match  */
1772                                 if (len >= c->mf.nice_match_len) {
1773
1774                                         in_next = lzms_skip_bytes(c, len - 1, in_next + 1);
1775
1776                                         lzms_encode_item_list(c, cur_node);
1777                                         lzms_encode_item(c, len, source);
1778
1779                                         c->optimum_nodes[0].state = cur_node->state;
1780                                         cur_node = &c->optimum_nodes[0];
1781
1782                                         cur_node->state.upcoming_lz_offset = 0;
1783                                         cur_node->state.upcoming_delta_pair = pair;
1784                                         lzms_update_lru_queues(&cur_node->state);
1785                                         lzms_update_main_state(&cur_node->state, 1);
1786                                         lzms_update_match_state(&cur_node->state, 1);
1787                                         lzms_update_delta_state(&cur_node->state, 0);
1788                                         goto begin;
1789                                 }
1790
1791                                 while (end_node < cur_node + len)
1792                                         (++end_node)->cost = INFINITE_COST;
1793
1794                                 u32 base_cost = cur_node->cost +
1795                                                 lzms_bit_1_cost(cur_node->state.main_state,
1796                                                                 c->probs.main) +
1797                                                 lzms_bit_1_cost(cur_node->state.match_state,
1798                                                                 c->probs.match) +
1799                                                 lzms_bit_0_cost(cur_node->state.delta_state,
1800                                                                 c->probs.delta) +
1801                                                 lzms_delta_source_cost(c, power, raw_offset);
1802
1803                                 u32 l = NBYTES_HASHED_FOR_DELTA;
1804                                 do {
1805                                         u32 cost = base_cost + lzms_fast_length_cost(c, l);
1806                                         if (cost < (cur_node + l)->cost) {
1807                                                 (cur_node + l)->cost = cost;
1808                                                 (cur_node + l)->item = (struct lzms_item) {
1809                                                         .length = l,
1810                                                         .source = source,
1811                                                 };
1812                                                 (cur_node + l)->num_extra_items = 0;
1813                                         }
1814                                 } while (++l <= len);
1815                         }
1816                 }
1817
1818                 /* Literal  */
1819                 if (end_node < cur_node + 1)
1820                         (++end_node)->cost = INFINITE_COST;
1821                 const u32 cur_and_lit_cost = cur_node->cost +
1822                                              lzms_literal_cost(c, cur_node->state.main_state,
1823                                                                *in_next);
1824                 if (cur_and_lit_cost < (cur_node + 1)->cost) {
1825                         (cur_node + 1)->cost = cur_and_lit_cost;
1826                         (cur_node + 1)->item = (struct lzms_item) {
1827                                 .length = 1,
1828                                 .source = *in_next,
1829                         };
1830                         (cur_node + 1)->num_extra_items = 0;
1831                 } else if (c->try_lit_lzrep0 && in_end - (in_next + 1) >= 2) {
1832                         /* try lit + LZ-rep0  */
1833                         const u32 offset =
1834                                 (cur_node->state.prev_lz_offset) ?
1835                                         cur_node->state.prev_lz_offset :
1836                                         cur_node->state.recent_lz_offsets[0];
1837
1838                         if (load_u16_unaligned(in_next + 1) ==
1839                             load_u16_unaligned(in_next + 1 - offset))
1840                         {
1841                                 const u32 rep0_len = lz_extend(in_next + 1,
1842                                                                in_next + 1 - offset,
1843                                                                2,
1844                                                                min(in_end - (in_next + 1),
1845                                                                    c->mf.nice_match_len));
1846
1847                                 unsigned main_state = cur_node->state.main_state;
1848
1849                                 /* Update main_state after literal  */
1850                                 main_state = (main_state << 1 | 0) % LZMS_NUM_MAIN_PROBS;
1851
1852                                 /* Add cost of LZ-rep0  */
1853                                 const u32 cost = cur_and_lit_cost +
1854                                                  lzms_bit_1_cost(main_state, c->probs.main) +
1855                                                  lzms_bit_0_cost(cur_node->state.match_state,
1856                                                                  c->probs.match) +
1857                                                  lzms_bit_1_cost(cur_node->state.lz_state,
1858                                                                  c->probs.lz) +
1859                                                  lzms_bit_0_cost(cur_node->state.lz_rep_states[0],
1860                                                                  c->probs.lz_rep[0]) +
1861                                                  lzms_fast_length_cost(c, rep0_len);
1862
1863                                 const u32 total_len = 1 + rep0_len;
1864
1865                                 while (end_node < cur_node + total_len)
1866                                         (++end_node)->cost = INFINITE_COST;
1867
1868                                 if (cost < (cur_node + total_len)->cost) {
1869                                         (cur_node + total_len)->cost = cost;
1870                                         (cur_node + total_len)->item = (struct lzms_item) {
1871                                                 .length = rep0_len,
1872                                                 .source = 0,
1873                                         };
1874                                         (cur_node + total_len)->extra_items[0] = (struct lzms_item) {
1875                                                 .length = 1,
1876                                                 .source = *in_next,
1877                                         };
1878                                         (cur_node + total_len)->num_extra_items = 1;
1879                                 }
1880                         }
1881                 }
1882
1883                 /* Advance to the next position.  */
1884                 in_next++;
1885                 cur_node++;
1886
1887                 /* The lowest-cost path to the current position is now known.
1888                  * Finalize the adaptive state that results from taking this
1889                  * lowest-cost path.  */
1890                 struct lzms_item item_to_take = cur_node->item;
1891                 struct lzms_optimum_node *source_node = cur_node - item_to_take.length;
1892                 int next_item_idx = -1;
1893                 for (unsigned i = 0; i < cur_node->num_extra_items; i++) {
1894                         item_to_take = cur_node->extra_items[i];
1895                         source_node -= item_to_take.length;
1896                         next_item_idx++;
1897                 }
1898                 cur_node->state = source_node->state;
1899                 for (;;) {
1900                         const u32 length = item_to_take.length;
1901                         u32 source = item_to_take.source;
1902
1903                         cur_node->state.upcoming_lz_offset = 0;
1904                         cur_node->state.upcoming_delta_pair = 0;
1905                         if (length > 1) {
1906                                 /* Match  */
1907
1908                                 lzms_update_main_state(&cur_node->state, 1);
1909
1910                                 if (source & DELTA_SOURCE_TAG) {
1911                                         /* Delta match  */
1912
1913                                         lzms_update_match_state(&cur_node->state, 1);
1914                                         source &= ~DELTA_SOURCE_TAG;
1915
1916                                         if (source >= LZMS_NUM_DELTA_REPS) {
1917                                                 /* Explicit offset delta match  */
1918                                                 lzms_update_delta_state(&cur_node->state, 0);
1919                                                 cur_node->state.upcoming_delta_pair =
1920                                                         source - (LZMS_NUM_DELTA_REPS - 1);
1921                                         } else {
1922                                                 /* Repeat offset delta match  */
1923                                                 int rep_idx = source;
1924
1925                                                 lzms_update_delta_state(&cur_node->state, 1);
1926                                                 lzms_update_delta_rep_states(&cur_node->state, rep_idx);
1927
1928                                                 cur_node->state.upcoming_delta_pair =
1929                                                         cur_node->state.recent_delta_pairs[rep_idx];
1930
1931                                                 for (int i = rep_idx; i < LZMS_NUM_DELTA_REPS; i++)
1932                                                         cur_node->state.recent_delta_pairs[i] =
1933                                                                 cur_node->state.recent_delta_pairs[i + 1];
1934                                         }
1935                                 } else {
1936                                         lzms_update_match_state(&cur_node->state, 0);
1937
1938                                         if (source >= LZMS_NUM_LZ_REPS) {
1939                                                 /* Explicit offset LZ match  */
1940                                                 lzms_update_lz_state(&cur_node->state, 0);
1941                                                 cur_node->state.upcoming_lz_offset =
1942                                                         source - (LZMS_NUM_LZ_REPS - 1);
1943                                         } else {
1944                                                 /* Repeat offset LZ match  */
1945                                                 int rep_idx = source;
1946
1947                                                 lzms_update_lz_state(&cur_node->state, 1);
1948                                                 lzms_update_lz_rep_states(&cur_node->state, rep_idx);
1949
1950                                                 cur_node->state.upcoming_lz_offset =
1951                                                         cur_node->state.recent_lz_offsets[rep_idx];
1952
1953                                                 for (int i = rep_idx; i < LZMS_NUM_LZ_REPS; i++)
1954                                                         cur_node->state.recent_lz_offsets[i] =
1955                                                                 cur_node->state.recent_lz_offsets[i + 1];
1956                                         }
1957                                 }
1958                         } else {
1959                                 /* Literal  */
1960                                 lzms_update_main_state(&cur_node->state, 0);
1961                         }
1962
1963                         lzms_update_lru_queues(&cur_node->state);
1964
1965                         if (next_item_idx < 0)
1966                                 break;
1967                         if (next_item_idx == 0)
1968                                 item_to_take = cur_node->item;
1969                         else
1970                                 item_to_take = cur_node->extra_items[next_item_idx - 1];
1971                         --next_item_idx;
1972                 }
1973
1974                 /*
1975                  * This loop will terminate when either of the following
1976                  * conditions is true:
1977                  *
1978                  * (1) cur_node == end_node
1979                  *
1980                  *      There are no paths that extend beyond the current
1981                  *      position.  In this case, any path to a later position
1982                  *      must pass through the current position, so we can go
1983                  *      ahead and choose the list of items that led to this
1984                  *      position.
1985                  *
1986                  * (2) cur_node == &c->optimum_nodes[NUM_OPTIM_NODES]
1987                  *
1988                  *      This bounds the number of times the algorithm can step
1989                  *      forward before it is guaranteed to start choosing items.
1990                  *      This limits the memory usage.  It also guarantees that
1991                  *      the parser will not go too long without updating the
1992                  *      probability tables.
1993                  *
1994                  * Note: no check for end-of-buffer is needed because
1995                  * end-of-buffer will trigger condition (1).
1996                  */
1997                 if (cur_node == end_node ||
1998                     cur_node == &c->optimum_nodes[NUM_OPTIM_NODES])
1999                 {
2000                         lzms_encode_nonempty_item_list(c, cur_node);
2001                         c->optimum_nodes[0].state = cur_node->state;
2002                         goto begin;
2003                 }
2004         }
2005 }
2006
2007 static void
2008 lzms_init_states_and_probabilities(struct lzms_compressor *c)
2009 {
2010         c->main_state = 0;
2011         c->match_state = 0;
2012         c->lz_state = 0;
2013         for (int i = 0; i < LZMS_NUM_LZ_REP_DECISIONS; i++)
2014                 c->lz_rep_states[i] = 0;
2015         c->delta_state = 0;
2016         for (int i = 0; i < LZMS_NUM_DELTA_REP_DECISIONS; i++)
2017                 c->delta_rep_states[i] = 0;
2018
2019         lzms_init_probabilities(&c->probs);
2020 }
2021
2022 static void
2023 lzms_init_huffman_codes(struct lzms_compressor *c, unsigned num_offset_slots)
2024 {
2025         lzms_init_huffman_code(&c->literal_rebuild_info,
2026                                LZMS_NUM_LITERAL_SYMS,
2027                                LZMS_LITERAL_CODE_REBUILD_FREQ,
2028                                c->literal_codewords,
2029                                c->literal_lens,
2030                                c->literal_freqs);
2031
2032         lzms_init_huffman_code(&c->lz_offset_rebuild_info,
2033                                num_offset_slots,
2034                                LZMS_LZ_OFFSET_CODE_REBUILD_FREQ,
2035                                c->lz_offset_codewords,
2036                                c->lz_offset_lens,
2037                                c->lz_offset_freqs);
2038
2039         lzms_init_huffman_code(&c->length_rebuild_info,
2040                                LZMS_NUM_LENGTH_SYMS,
2041                                LZMS_LENGTH_CODE_REBUILD_FREQ,
2042                                c->length_codewords,
2043                                c->length_lens,
2044                                c->length_freqs);
2045
2046         lzms_init_huffman_code(&c->delta_offset_rebuild_info,
2047                                num_offset_slots,
2048                                LZMS_DELTA_OFFSET_CODE_REBUILD_FREQ,
2049                                c->delta_offset_codewords,
2050                                c->delta_offset_lens,
2051                                c->delta_offset_freqs);
2052
2053         lzms_init_huffman_code(&c->delta_power_rebuild_info,
2054                                LZMS_NUM_DELTA_POWER_SYMS,
2055                                LZMS_DELTA_POWER_CODE_REBUILD_FREQ,
2056                                c->delta_power_codewords,
2057                                c->delta_power_lens,
2058                                c->delta_power_freqs);
2059 }
2060
2061 /*
2062  * Flush the output streams, prepare the final compressed data, and return its
2063  * size in bytes.
2064  *
2065  * A return value of 0 indicates that the data could not be compressed to fit in
2066  * the available space.
2067  */
2068 static size_t
2069 lzms_finalize(struct lzms_compressor *c)
2070 {
2071         size_t num_forwards_units;
2072         size_t num_backwards_units;
2073
2074         /* Flush both the forwards and backwards streams, and make sure they
2075          * didn't cross each other and start overwriting each other's data.  */
2076         if (!lzms_output_bitstream_flush(&c->os))
2077                 return 0;
2078
2079         if (!lzms_range_encoder_flush(&c->rc))
2080                 return 0;
2081
2082         if (c->rc.next > c->os.next)
2083                 return 0;
2084
2085         /* Now the compressed buffer contains the data output by the forwards
2086          * bitstream, then empty space, then data output by the backwards
2087          * bitstream.  Move the data output by the backwards bitstream to be
2088          * adjacent to the data output by the forward bitstream, and calculate
2089          * the compressed size that this results in.  */
2090         num_forwards_units = c->rc.next - c->rc.begin;
2091         num_backwards_units = c->rc.end - c->os.next;
2092
2093         memmove(c->rc.next, c->os.next, num_backwards_units * sizeof(le16));
2094
2095         return (num_forwards_units + num_backwards_units) * sizeof(le16);
2096 }
2097
2098 static u64
2099 lzms_get_needed_memory(size_t max_bufsize, unsigned compression_level,
2100                        bool destructive)
2101 {
2102         u64 size = 0;
2103
2104         if (max_bufsize > LZMS_MAX_BUFFER_SIZE)
2105                 return 0;
2106
2107         size += sizeof(struct lzms_compressor);
2108
2109         if (!destructive)
2110                 size += max_bufsize; /* in_buffer */
2111
2112         /* mf */
2113         size += lcpit_matchfinder_get_needed_memory(max_bufsize);
2114
2115         return size;
2116 }
2117
2118 static int
2119 lzms_create_compressor(size_t max_bufsize, unsigned compression_level,
2120                        bool destructive, void **c_ret)
2121 {
2122         struct lzms_compressor *c;
2123         u32 nice_match_len;
2124
2125         if (max_bufsize > LZMS_MAX_BUFFER_SIZE)
2126                 return WIMLIB_ERR_INVALID_PARAM;
2127
2128         c = ALIGNED_MALLOC(sizeof(struct lzms_compressor), 64);
2129         if (!c)
2130                 goto oom0;
2131
2132         c->destructive = destructive;
2133
2134         /* Scale nice_match_len with the compression level.  But to allow an
2135          * optimization for length cost calculations, don't allow nice_match_len
2136          * to exceed MAX_FAST_LENGTH.  */
2137         nice_match_len = min(((u64)compression_level * 63) / 50, MAX_FAST_LENGTH);
2138
2139         c->use_delta_matches = (compression_level >= 35);
2140         c->try_lzmatch_lit_lzrep0 = (compression_level >= 45);
2141         c->try_lit_lzrep0 = (compression_level >= 60);
2142         c->try_lzrep_lit_lzrep0 = (compression_level >= 60);
2143
2144         if (!c->destructive) {
2145                 c->in_buffer = MALLOC(max_bufsize);
2146                 if (!c->in_buffer)
2147                         goto oom1;
2148         }
2149
2150         if (!lcpit_matchfinder_init(&c->mf, max_bufsize, 2, nice_match_len))
2151                 goto oom2;
2152
2153         lzms_init_fast_length_slot_tab(c);
2154         lzms_init_offset_slot_tabs(c);
2155
2156         *c_ret = c;
2157         return 0;
2158
2159 oom2:
2160         if (!c->destructive)
2161                 FREE(c->in_buffer);
2162 oom1:
2163         ALIGNED_FREE(c);
2164 oom0:
2165         return WIMLIB_ERR_NOMEM;
2166 }
2167
2168 static size_t
2169 lzms_compress(const void *restrict in, size_t in_nbytes,
2170               void *restrict out, size_t out_nbytes_avail, void *restrict _c)
2171 {
2172         struct lzms_compressor *c = _c;
2173         size_t result;
2174
2175         /* Don't bother trying to compress extremely small inputs.  */
2176         if (in_nbytes < 4)
2177                 return 0;
2178
2179         /* Copy the input data into the internal buffer and preprocess it.  */
2180         if (c->destructive)
2181                 c->in_buffer = (void *)in;
2182         else
2183                 memcpy(c->in_buffer, in, in_nbytes);
2184         c->in_nbytes = in_nbytes;
2185         lzms_x86_filter(c->in_buffer, in_nbytes, c->last_target_usages, false);
2186
2187         /* Prepare the matchfinders.  */
2188         lcpit_matchfinder_load_buffer(&c->mf, c->in_buffer, c->in_nbytes);
2189         if (c->use_delta_matches)
2190                 lzms_init_delta_matchfinder(c);
2191
2192         /* Initialize the encoder structures.  */
2193         lzms_range_encoder_init(&c->rc, out, out_nbytes_avail / sizeof(le16));
2194         lzms_output_bitstream_init(&c->os, out, out_nbytes_avail / sizeof(le16));
2195         lzms_init_states_and_probabilities(c);
2196         lzms_init_huffman_codes(c, lzms_get_num_offset_slots(c->in_nbytes));
2197
2198         /* The main loop: parse and encode.  */
2199         lzms_near_optimal_parse(c);
2200
2201         /* Return the compressed data size or 0.  */
2202         result = lzms_finalize(c);
2203         if (!result && c->destructive)
2204                 lzms_x86_filter(c->in_buffer, c->in_nbytes, c->last_target_usages, true);
2205         return result;
2206 }
2207
2208 static void
2209 lzms_free_compressor(void *_c)
2210 {
2211         struct lzms_compressor *c = _c;
2212
2213         if (!c->destructive)
2214                 FREE(c->in_buffer);
2215         lcpit_matchfinder_destroy(&c->mf);
2216         ALIGNED_FREE(c);
2217 }
2218
/* Operations table hooking the LZMS compressor into wimlib's generic
 * compressor interface.  */
const struct compressor_ops lzms_compressor_ops = {
	.get_needed_memory  = lzms_get_needed_memory,
	.create_compressor  = lzms_create_compressor,
	.compress           = lzms_compress,
	.free_compressor    = lzms_free_compressor,
};