caa93130c75ca44f656a1d8464621985d6eb3bd1
[wimlib] / src / lzms_compress.c
1 /*
2  * lzms_compress.c
3  *
4  * A compressor for the LZMS compression format.
5  */
6
7 /*
8  * Copyright (C) 2013, 2014, 2015 Eric Biggers
9  *
10  * This file is free software; you can redistribute it and/or modify it under
11  * the terms of the GNU Lesser General Public License as published by the Free
12  * Software Foundation; either version 3 of the License, or (at your option) any
13  * later version.
14  *
15  * This file is distributed in the hope that it will be useful, but WITHOUT
16  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17  * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
18  * details.
19  *
20  * You should have received a copy of the GNU Lesser General Public License
21  * along with this file; if not, see http://www.gnu.org/licenses/.
22  */
23
24 #ifdef HAVE_CONFIG_H
25 #  include "config.h"
26 #endif
27
28 #include <limits.h>
29 #include <string.h>
30
31 #include "wimlib/compress_common.h"
32 #include "wimlib/compressor_ops.h"
33 #include "wimlib/error.h"
34 #include "wimlib/lcpit_matchfinder.h"
35 #include "wimlib/lz_extend.h"
36 #include "wimlib/lz_hash.h"
37 #include "wimlib/lzms_common.h"
38 #include "wimlib/unaligned.h"
39 #include "wimlib/util.h"
40
41 /*
42  * MAX_FAST_LENGTH is the maximum match length for which the length slot can be
43  * looked up directly in 'fast_length_slot_tab' and the length cost can be
44  * looked up directly in 'fast_length_cost_tab'.
45  *
46  * We also limit the 'nice_match_len' parameter to this value.  Consequently, if
47  * the longest match found is shorter than 'nice_match_len', then it must also
48  * be shorter than MAX_FAST_LENGTH.  This makes it possible to do fast lookups
49  * of length costs using 'fast_length_cost_tab' without having to keep checking
50  * whether the length exceeds MAX_FAST_LENGTH or not.
51  */
52 #define MAX_FAST_LENGTH         255
53
54 /* NUM_OPTIM_NODES is the maximum number of bytes the parsing algorithm will
55  * step forward before forcing the pending items to be encoded.  If this value
56  * is increased, then there will be fewer forced flushes, but the probability
57  * entries and Huffman codes will be more likely to become outdated.  */
58 #define NUM_OPTIM_NODES         2048
59
60 /* COST_SHIFT is a scaling factor that makes it possible to consider fractional
61  * bit costs.  A single bit has a cost of (1 << COST_SHIFT).  */
62 #define COST_SHIFT              6
63
64 /* Length of the hash table for finding delta matches  */
65 #define DELTA_HASH_ORDER        17
66 #define DELTA_HASH_LENGTH       ((u32)1 << DELTA_HASH_ORDER)
67
68 /* The number of bytes to hash when finding delta matches; also taken to be the
69  * minimum length of an explicit offset delta match  */
70 #define NBYTES_HASHED_FOR_DELTA 3
71
72 /* The number of delta match powers to consider (must be <=
73  * LZMS_NUM_DELTA_POWER_SYMS)  */
74 #define NUM_POWERS_TO_CONSIDER  6
75
76 /* This structure tracks the state of writing bits as a series of 16-bit coding
77  * units, starting at the end of the output buffer and proceeding backwards.  */
78 struct lzms_output_bitstream {
79
80         /* Bits that haven't yet been written to the output buffer  */
81         u64 bitbuf;
82
83         /* Number of bits currently held in @bitbuf  */
84         unsigned bitcount;
85
86         /* Pointer to one past the next position in the output buffer at which
87          * to output a 16-bit coding unit  */
88         le16 *next;
89
90         /* Pointer to the beginning of the output buffer (this is the "end" when
91          * writing backwards!)  */
92         le16 *begin;
93 };
94
95 /* This structure tracks the state of range encoding and its output, which
96  * starts at the beginning of the output buffer and proceeds forwards.  */
97 struct lzms_range_encoder {
98
99         /* The lower boundary of the current range.  Logically, this is a 33-bit
100          * integer whose high bit is needed to detect carries.  */
101         u64 lower_bound;
102
103         /* The size of the current range  */
104         u32 range_size;
105
106         /* The next 16-bit coding unit to output  */
107         u16 cache;
108
109         /* The number of 16-bit coding units whose output has been delayed due
110          * to possible carrying.  The first such coding unit is @cache; all
111          * subsequent such coding units are 0xffff.  */
112         u32 cache_size;
113
114         /* Pointer to the beginning of the output buffer  */
115         le16 *begin;
116
117         /* Pointer to the position in the output buffer at which the next coding
118          * unit must be written  */
119         le16 *next;
120
121         /* Pointer to just past the end of the output buffer  */
122         le16 *end;
123 };
124
125 /* Bookkeeping information for an adaptive Huffman code  */
126 struct lzms_huffman_rebuild_info {
127
128         /* The remaining number of symbols to encode until this code must be
129          * rebuilt  */
130         unsigned num_syms_until_rebuild;
131
132         /* The number of symbols in this code  */
133         unsigned num_syms;
134
135         /* The rebuild frequency of this code, in symbols  */
136         unsigned rebuild_freq;
137
138         /* The Huffman codeword of each symbol in this code  */
139         u32 *codewords;
140
141         /* The length of each Huffman codeword, in bits  */
142         u8 *lens;
143
144         /* The frequency of each symbol in this code  */
145         u32 *freqs;
146 };
147
148 /*
149  * The compressor-internal representation of a match or literal.
150  *
151  * Literals have length=1; matches have length > 1.  (We disallow matches of
152  * length 1, even though this is a valid length in LZMS.)
153  *
154  * The source is encoded as follows:
155  *
156  * - Literals: the literal byte itself
157  * - Explicit offset LZ matches: the match offset plus (LZMS_NUM_LZ_REPS - 1)
158  * - Repeat offset LZ matches: the index of the offset in recent_lz_offsets
159  * - Explicit offset delta matches: DELTA_SOURCE_TAG is set, the next 3 bits are
160  *   the power, and the remainder is the raw offset plus (LZMS_NUM_DELTA_REPS-1)
161  * - Repeat offset delta matches: DELTA_SOURCE_TAG is set, and the remainder is
162  *   the index of the (power, raw_offset) pair in recent_delta_pairs
163  */
164 struct lzms_item {
165         u32 length;
166         u32 source;
167 };
168
169 #define DELTA_SOURCE_TAG                ((u32)1 << 31)
170 #define DELTA_SOURCE_POWER_SHIFT        28
171 #define DELTA_SOURCE_RAW_OFFSET_MASK    (((u32)1 << DELTA_SOURCE_POWER_SHIFT) - 1)
172
173 static inline void
174 check_that_powers_fit_in_bitfield(void)
175 {
176         BUILD_BUG_ON(LZMS_NUM_DELTA_POWER_SYMS > (1 << (31 - DELTA_SOURCE_POWER_SHIFT)));
177 }
178
179 /* A stripped-down version of the adaptive state in LZMS which excludes the
180  * probability entries and Huffman codes  */
181 struct lzms_adaptive_state {
182
183         /* Recent offsets for LZ matches  */
184         u32 recent_lz_offsets[LZMS_NUM_LZ_REPS + 1];
185         u32 prev_lz_offset; /* 0 means none */
186         u32 upcoming_lz_offset; /* 0 means none */
187
188         /* Recent (power, raw offset) pairs for delta matches.
189          * The low DELTA_SOURCE_POWER_SHIFT bits of each entry are the raw
190          * offset, and the high bits are the power.  */
191         u32 recent_delta_pairs[LZMS_NUM_DELTA_REPS + 1];
192         u32 prev_delta_pair; /* 0 means none */
193         u32 upcoming_delta_pair; /* 0 means none  */
194
195         /* States for predicting the probabilities of item types  */
196         u8 main_state;
197         u8 match_state;
198         u8 lz_state;
199         u8 lz_rep_states[LZMS_NUM_LZ_REP_DECISIONS];
200         u8 delta_state;
201         u8 delta_rep_states[LZMS_NUM_DELTA_REP_DECISIONS];
202 } _aligned_attribute(64);
203
204 /*
205  * This structure represents a byte position in the preprocessed input data and
206  * a node in the graph of possible match/literal choices.
207  *
208  * Logically, each incoming edge to this node is labeled with a literal or a
209  * match that can be taken to reach this position from an earlier position; and
210  * each outgoing edge from this node is labeled with a literal or a match that
211  * can be taken to advance from this position to a later position.
212  */
213 struct lzms_optimum_node {
214
215         /*
216          * The cost of the lowest-cost path that has been found to reach this
217          * position.  This can change as progressively lower cost paths are
218          * found to reach this position.
219          */
220         u32 cost;
221 #define INFINITE_COST UINT32_MAX
222
223         /*
224          * @item is the last item that was taken to reach this position to reach
225          * it with the stored @cost.  This can change as progressively lower
226          * cost paths are found to reach this position.
227          *
228          * In some cases we look ahead more than one item.  If we looked ahead n
229          * items to reach this position, then @item is the last item taken,
230          * @extra_items contains the other items ordered from second-to-last to
231          * first, and @num_extra_items is n - 1.
232          */
233         unsigned num_extra_items;
234         struct lzms_item item;
235         struct lzms_item extra_items[2];
236
237         /*
238          * The adaptive state that exists at this position.  This is filled in
239          * lazily, only after the minimum-cost path to this position is found.
240          *
241          * Note: the way the algorithm handles this adaptive state in the
242          * "minimum-cost" parse is actually only an approximation.  It's
243          * possible for the globally optimal, minimum cost path to contain a
244          * prefix, ending at a position, where that path prefix is *not* the
245          * minimum cost path to that position.  This can happen if such a path
246          * prefix results in a different adaptive state which results in lower
247          * costs later.  Although the algorithm does do some heuristic
248          * multi-item lookaheads, it does not solve this problem in general.
249          *
250          * Note: this adaptive state structure also does not include the
251          * probability entries or current Huffman codewords.  Those aren't
252          * maintained per-position and are only updated occassionally.
253          */
254         struct lzms_adaptive_state state;
255 } _aligned_attribute(64);
256
257 /* The main compressor structure  */
258 struct lzms_compressor {
259
260         /* The matchfinder for LZ matches  */
261         struct lcpit_matchfinder mf;
262
263         /* The preprocessed buffer of data being compressed  */
264         u8 *in_buffer;
265
266         /* The number of bytes of data to be compressed, which is the number of
267          * bytes of data in @in_buffer that are actually valid  */
268         size_t in_nbytes;
269
270         /*
271          * Boolean flags to enable consideration of various types of multi-step
272          * operations during parsing.
273          *
274          * Among other cases, multi-step operations can help with gaps where two
275          * matches are separated by a non-matching byte.
276          *
277          * This idea is borrowed from Igor Pavlov's LZMA encoder.
278          */
279         bool try_lit_lzrep0;
280         bool try_lzrep_lit_lzrep0;
281         bool try_lzmatch_lit_lzrep0;
282
283         /*
284          * If true, the compressor can use delta matches.  This slows down
285          * compression.  It improves the compression ratio greatly, slightly, or
286          * not at all, depending on the input data.
287          */
288         bool use_delta_matches;
289
290         /* If true, the compressor need not preserve the input buffer if it
291          * compresses the data successfully.  */
292         bool destructive;
293
294         /* 'last_target_usages' is a large array that is only needed for
295          * preprocessing, so it is in union with fields that don't need to be
296          * initialized until after preprocessing.  */
297         union {
298         struct {
299
300         /* Temporary space to store matches found by the LZ matchfinder  */
301         struct lz_match matches[MAX_FAST_LENGTH - LZMS_MIN_MATCH_LENGTH + 1];
302
303         /* Hash table for finding delta matches  */
304         u32 delta_hash_table[DELTA_HASH_LENGTH];
305
306         /* For each delta power, the hash code for the next sequence  */
307         u32 next_delta_hashes[NUM_POWERS_TO_CONSIDER];
308
309         /* The per-byte graph nodes for near-optimal parsing  */
310         struct lzms_optimum_node optimum_nodes[NUM_OPTIM_NODES + MAX_FAST_LENGTH];
311
312         /* Table: length => current cost for small match lengths  */
313         u32 fast_length_cost_tab[MAX_FAST_LENGTH + 1];
314
315         /* Range encoder which outputs to the beginning of the compressed data
316          * buffer, proceeding forwards  */
317         struct lzms_range_encoder rc;
318
319         /* Bitstream which outputs to the end of the compressed data buffer,
320          * proceeding backwards  */
321         struct lzms_output_bitstream os;
322
323         /* States and probability entries for item type disambiguation  */
324         unsigned main_state;
325         unsigned match_state;
326         unsigned lz_state;
327         unsigned lz_rep_states[LZMS_NUM_LZ_REP_DECISIONS];
328         unsigned delta_state;
329         unsigned delta_rep_states[LZMS_NUM_DELTA_REP_DECISIONS];
330         struct lzms_probability_entry main_probs[LZMS_NUM_MAIN_PROBS];
331         struct lzms_probability_entry match_probs[LZMS_NUM_MATCH_PROBS];
332         struct lzms_probability_entry lz_probs[LZMS_NUM_LZ_PROBS];
333         struct lzms_probability_entry lz_rep_probs[LZMS_NUM_LZ_REP_DECISIONS]
334                                                   [LZMS_NUM_LZ_REP_PROBS];
335         struct lzms_probability_entry delta_probs[LZMS_NUM_DELTA_PROBS];
336         struct lzms_probability_entry delta_rep_probs[LZMS_NUM_DELTA_REP_DECISIONS]
337                                                      [LZMS_NUM_DELTA_REP_PROBS];
338
339         /* Huffman codes  */
340
341         struct lzms_huffman_rebuild_info literal_rebuild_info;
342         u32 literal_codewords[LZMS_NUM_LITERAL_SYMS];
343         u8 literal_lens[LZMS_NUM_LITERAL_SYMS];
344         u32 literal_freqs[LZMS_NUM_LITERAL_SYMS];
345
346         struct lzms_huffman_rebuild_info lz_offset_rebuild_info;
347         u32 lz_offset_codewords[LZMS_MAX_NUM_OFFSET_SYMS];
348         u8 lz_offset_lens[LZMS_MAX_NUM_OFFSET_SYMS];
349         u32 lz_offset_freqs[LZMS_MAX_NUM_OFFSET_SYMS];
350
351         struct lzms_huffman_rebuild_info length_rebuild_info;
352         u32 length_codewords[LZMS_NUM_LENGTH_SYMS];
353         u8 length_lens[LZMS_NUM_LENGTH_SYMS];
354         u32 length_freqs[LZMS_NUM_LENGTH_SYMS];
355
356         struct lzms_huffman_rebuild_info delta_offset_rebuild_info;
357         u32 delta_offset_codewords[LZMS_MAX_NUM_OFFSET_SYMS];
358         u8 delta_offset_lens[LZMS_MAX_NUM_OFFSET_SYMS];
359         u32 delta_offset_freqs[LZMS_MAX_NUM_OFFSET_SYMS];
360
361         struct lzms_huffman_rebuild_info delta_power_rebuild_info;
362         u32 delta_power_codewords[LZMS_NUM_DELTA_POWER_SYMS];
363         u8 delta_power_lens[LZMS_NUM_DELTA_POWER_SYMS];
364         u32 delta_power_freqs[LZMS_NUM_DELTA_POWER_SYMS];
365
366         }; /* struct */
367
368         s32 last_target_usages[65536];
369
370         }; /* union */
371
372         /* Table: length => length slot for small match lengths  */
373         u8 fast_length_slot_tab[MAX_FAST_LENGTH + 1];
374
375         /* Tables for mapping offsets to offset slots  */
376
377         /* slots [0, 167); 0 <= num_extra_bits <= 10 */
378         u8 offset_slot_tab_1[0xe4a5];
379
380         /* slots [167, 427); 11 <= num_extra_bits <= 15 */
381         u16 offset_slot_tab_2[0x3d0000 >> 11];
382
383         /* slots [427, 799); 16 <= num_extra_bits  */
384         u16 offset_slot_tab_3[((LZMS_MAX_MATCH_OFFSET + 1) - 0xe4a5) >> 16];
385 };
386
387 /******************************************************************************
388  *                   Offset and length slot acceleration                      *
389  ******************************************************************************/
390
391 /* Generate the acceleration table for length slots.  */
392 static void
393 lzms_init_fast_length_slot_tab(struct lzms_compressor *c)
394 {
395         unsigned slot = 0;
396         for (u32 len = LZMS_MIN_MATCH_LENGTH; len <= MAX_FAST_LENGTH; len++) {
397                 if (len >= lzms_length_slot_base[slot + 1])
398                         slot++;
399                 c->fast_length_slot_tab[len] = slot;
400         }
401 }
402
403 /* Generate the acceleration tables for offset slots.  */
404 static void
405 lzms_init_offset_slot_tabs(struct lzms_compressor *c)
406 {
407         u32 offset;
408         unsigned slot = 0;
409
410         /* slots [0, 167); 0 <= num_extra_bits <= 10 */
411         for (offset = 1; offset < 0xe4a5; offset++) {
412                 if (offset >= lzms_offset_slot_base[slot + 1])
413                         slot++;
414                 c->offset_slot_tab_1[offset] = slot;
415         }
416
417         /* slots [167, 427); 11 <= num_extra_bits <= 15 */
418         for (; offset < 0x3de4a5; offset += (u32)1 << 11) {
419                 if (offset >= lzms_offset_slot_base[slot + 1])
420                         slot++;
421                 c->offset_slot_tab_2[(offset - 0xe4a5) >> 11] = slot;
422         }
423
424         /* slots [427, 799); 16 <= num_extra_bits  */
425         for (; offset < LZMS_MAX_MATCH_OFFSET + 1; offset += (u32)1 << 16) {
426                 if (offset >= lzms_offset_slot_base[slot + 1])
427                         slot++;
428                 c->offset_slot_tab_3[(offset - 0xe4a5) >> 16] = slot;
429         }
430 }
431
432 /*
433  * Return the length slot for the specified match length, using the compressor's
434  * acceleration table if the length is small enough.
435  */
436 static inline unsigned
437 lzms_comp_get_length_slot(const struct lzms_compressor *c, u32 length)
438 {
439         if (likely(length <= MAX_FAST_LENGTH))
440                 return c->fast_length_slot_tab[length];
441         return lzms_get_length_slot(length);
442 }
443
444 /*
445  * Return the offset slot for the specified match offset, using the compressor's
446  * acceleration tables to speed up the mapping.
447  */
448 static inline unsigned
449 lzms_comp_get_offset_slot(const struct lzms_compressor *c, u32 offset)
450 {
451         if (offset < 0xe4a5)
452                 return c->offset_slot_tab_1[offset];
453         offset -= 0xe4a5;
454         if (offset < 0x3d0000)
455                 return c->offset_slot_tab_2[offset >> 11];
456         return c->offset_slot_tab_3[offset >> 16];
457 }
458
459 /******************************************************************************
460  *                             Range encoding                                 *
461  ******************************************************************************/
462
463 /*
464  * Initialize the range encoder @rc to write forwards to the specified buffer
465  * @out that is @count 16-bit integers long.
466  */
467 static void
468 lzms_range_encoder_init(struct lzms_range_encoder *rc, le16 *out, size_t count)
469 {
470         rc->lower_bound = 0;
471         rc->range_size = 0xffffffff;
472         rc->cache = 0;
473         rc->cache_size = 1;
474         rc->begin = out;
475         rc->next = out - 1;
476         rc->end = out + count;
477 }
478
479 /*
480  * Attempt to flush bits from the range encoder.
481  *
482  * The basic idea is that we're writing bits from @rc->lower_bound to the
483  * output.  However, due to carrying, the writing of coding units with the
484  * maximum value, as well as one prior coding unit, must be delayed until it is
485  * determined whether a carry is needed.
486  *
487  * This is based on the public domain code for LZMA written by Igor Pavlov, but
488  * with the following differences:
489  *
490  *      - In LZMS, 16-bit coding units are required rather than 8-bit.
491  *
492  *      - In LZMS, the first coding unit is not ignored by the decompressor, so
493  *        the encoder cannot output a dummy value to that position.
494  */
495 static void
496 lzms_range_encoder_shift_low(struct lzms_range_encoder *rc)
497 {
498         if ((u32)(rc->lower_bound) < 0xffff0000 ||
499             (u32)(rc->lower_bound >> 32) != 0)
500         {
501                 /* Carry not needed (rc->lower_bound < 0xffff0000), or carry
502                  * occurred ((rc->lower_bound >> 32) != 0, a.k.a. the carry bit
503                  * is 1).  */
504                 do {
505                         if (likely(rc->next >= rc->begin)) {
506                                 if (rc->next != rc->end) {
507                                         put_unaligned_u16_le(rc->cache +
508                                                              (u16)(rc->lower_bound >> 32),
509                                                              rc->next++);
510                                 }
511                         } else {
512                                 rc->next++;
513                         }
514                         rc->cache = 0xffff;
515                 } while (--rc->cache_size != 0);
516
517                 rc->cache = (rc->lower_bound >> 16) & 0xffff;
518         }
519         ++rc->cache_size;
520         rc->lower_bound = (rc->lower_bound & 0xffff) << 16;
521 }
522
523 static bool
524 lzms_range_encoder_flush(struct lzms_range_encoder *rc)
525 {
526         for (int i = 0; i < 4; i++)
527                 lzms_range_encoder_shift_low(rc);
528         return rc->next != rc->end;
529 }
530
531 /*
532  * Encode the next bit using the range encoder.
533  *
534  * @prob is the probability out of LZMS_PROBABILITY_DENOMINATOR that the next
535  * bit is 0 rather than 1.
536  */
537 static inline void
538 lzms_range_encode_bit(struct lzms_range_encoder *rc, int bit, u32 prob)
539 {
540         /* Normalize if needed.  */
541         if (rc->range_size <= 0xffff) {
542                 rc->range_size <<= 16;
543                 lzms_range_encoder_shift_low(rc);
544         }
545
546         u32 bound = (rc->range_size >> LZMS_PROBABILITY_BITS) * prob;
547         if (bit == 0) {
548                 rc->range_size = bound;
549         } else {
550                 rc->lower_bound += bound;
551                 rc->range_size -= bound;
552         }
553 }
554
555 /*
556  * Encode a bit.  This wraps around lzms_range_encode_bit() to handle using and
557  * updating the state and its corresponding probability entry.
558  */
559 static inline void
560 lzms_encode_bit(int bit, unsigned *state_p, unsigned num_states,
561                 struct lzms_probability_entry *probs,
562                 struct lzms_range_encoder *rc)
563 {
564         struct lzms_probability_entry *prob_entry;
565         u32 prob;
566
567         /* Load the probability entry for the current state.  */
568         prob_entry = &probs[*state_p];
569
570         /* Update the state based on the next bit.  */
571         *state_p = ((*state_p << 1) | bit) & (num_states - 1);
572
573         /* Get the probability that the bit is 0.  */
574         prob = lzms_get_probability(prob_entry);
575
576         /* Update the probability entry.  */
577         lzms_update_probability_entry(prob_entry, bit);
578
579         /* Encode the bit using the range encoder.  */
580         lzms_range_encode_bit(rc, bit, prob);
581 }
582
583 /* Helper functions for encoding bits in the various decision classes  */
584
585 static void
586 lzms_encode_main_bit(struct lzms_compressor *c, int bit)
587 {
588         lzms_encode_bit(bit, &c->main_state, LZMS_NUM_MAIN_PROBS,
589                         c->main_probs, &c->rc);
590 }
591
592 static void
593 lzms_encode_match_bit(struct lzms_compressor *c, int bit)
594 {
595         lzms_encode_bit(bit, &c->match_state, LZMS_NUM_MATCH_PROBS,
596                         c->match_probs, &c->rc);
597 }
598
599 static void
600 lzms_encode_lz_bit(struct lzms_compressor *c, int bit)
601 {
602         lzms_encode_bit(bit, &c->lz_state, LZMS_NUM_LZ_PROBS,
603                         c->lz_probs, &c->rc);
604 }
605
606 static void
607 lzms_encode_lz_rep_bit(struct lzms_compressor *c, int bit, int idx)
608 {
609         lzms_encode_bit(bit, &c->lz_rep_states[idx], LZMS_NUM_LZ_REP_PROBS,
610                         c->lz_rep_probs[idx], &c->rc);
611 }
612
613 static void
614 lzms_encode_delta_bit(struct lzms_compressor *c, int bit)
615 {
616         lzms_encode_bit(bit, &c->delta_state, LZMS_NUM_DELTA_PROBS,
617                         c->delta_probs, &c->rc);
618 }
619
620 static void
621 lzms_encode_delta_rep_bit(struct lzms_compressor *c, int bit, int idx)
622 {
623         lzms_encode_bit(bit, &c->delta_rep_states[idx], LZMS_NUM_DELTA_REP_PROBS,
624                         c->delta_rep_probs[idx], &c->rc);
625 }
626
627 /******************************************************************************
628  *                   Huffman encoding and verbatim bits                       *
629  ******************************************************************************/
630
631 /*
632  * Initialize the output bitstream @os to write backwards to the specified
633  * buffer @out that is @count 16-bit integers long.
634  */
635 static void
636 lzms_output_bitstream_init(struct lzms_output_bitstream *os,
637                            le16 *out, size_t count)
638 {
639         os->bitbuf = 0;
640         os->bitcount = 0;
641         os->next = out + count;
642         os->begin = out;
643 }
644
645 /*
646  * Write some bits, contained in the low-order @num_bits bits of @bits, to the
647  * output bitstream @os.
648  *
649  * @max_num_bits is a compile-time constant that specifies the maximum number of
650  * bits that can ever be written at this call site.
651  */
652 static inline void
653 lzms_write_bits(struct lzms_output_bitstream *os, const u32 bits,
654                 const unsigned num_bits, const unsigned max_num_bits)
655 {
656         /* Add the bits to the bit buffer variable.  */
657         os->bitcount += num_bits;
658         os->bitbuf = (os->bitbuf << num_bits) | bits;
659
660         /* Check whether any coding units need to be written.  */
661         while (os->bitcount >= 16) {
662
663                 os->bitcount -= 16;
664
665                 /* Write a coding unit, unless it would underflow the buffer. */
666                 if (os->next != os->begin)
667                         put_unaligned_u16_le(os->bitbuf >> os->bitcount, --os->next);
668
669                 /* Optimization for call sites that never write more than 16
670                  * bits at once.  */
671                 if (max_num_bits <= 16)
672                         break;
673         }
674 }
675
676 /*
677  * Flush the output bitstream, ensuring that all bits written to it have been
678  * written to memory.  Return %true if all bits have been output successfully,
679  * or %false if an overrun occurred.
680  */
681 static bool
682 lzms_output_bitstream_flush(struct lzms_output_bitstream *os)
683 {
684         if (os->next == os->begin)
685                 return false;
686
687         if (os->bitcount != 0)
688                 put_unaligned_u16_le(os->bitbuf << (16 - os->bitcount), --os->next);
689
690         return true;
691 }
692
693 static void
694 lzms_build_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info)
695 {
696         make_canonical_huffman_code(rebuild_info->num_syms,
697                                     LZMS_MAX_CODEWORD_LENGTH,
698                                     rebuild_info->freqs,
699                                     rebuild_info->lens,
700                                     rebuild_info->codewords);
701         rebuild_info->num_syms_until_rebuild = rebuild_info->rebuild_freq;
702 }
703
704 static void
705 lzms_init_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info,
706                        unsigned num_syms, unsigned rebuild_freq,
707                        u32 *codewords, u8 *lens, u32 *freqs)
708 {
709         rebuild_info->num_syms = num_syms;
710         rebuild_info->rebuild_freq = rebuild_freq;
711         rebuild_info->codewords = codewords;
712         rebuild_info->lens = lens;
713         rebuild_info->freqs = freqs;
714         lzms_init_symbol_frequencies(freqs, num_syms);
715         lzms_build_huffman_code(rebuild_info);
716 }
717
718 static void
719 lzms_rebuild_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info)
720 {
721         lzms_build_huffman_code(rebuild_info);
722         lzms_dilute_symbol_frequencies(rebuild_info->freqs, rebuild_info->num_syms);
723 }
724
725 /*
726  * Encode a symbol using the specified Huffman code.  Then, if the Huffman code
727  * needs to be rebuilt, rebuild it and return true; otherwise return false.
728  */
729 static inline bool
730 lzms_huffman_encode_symbol(unsigned sym,
731                            const u32 *codewords, const u8 *lens, u32 *freqs,
732                            struct lzms_output_bitstream *os,
733                            struct lzms_huffman_rebuild_info *rebuild_info)
734 {
735         lzms_write_bits(os, codewords[sym], lens[sym], LZMS_MAX_CODEWORD_LENGTH);
736         ++freqs[sym];
737         if (--rebuild_info->num_syms_until_rebuild == 0) {
738                 lzms_rebuild_huffman_code(rebuild_info);
739                 return true;
740         }
741         return false;
742 }
743
744 /* Helper routines to encode symbols using the various Huffman codes  */
745
746 static bool
747 lzms_encode_literal_symbol(struct lzms_compressor *c, unsigned sym)
748 {
749         return lzms_huffman_encode_symbol(sym, c->literal_codewords,
750                                           c->literal_lens, c->literal_freqs,
751                                           &c->os, &c->literal_rebuild_info);
752 }
753
754 static bool
755 lzms_encode_lz_offset_symbol(struct lzms_compressor *c, unsigned sym)
756 {
757         return lzms_huffman_encode_symbol(sym, c->lz_offset_codewords,
758                                           c->lz_offset_lens, c->lz_offset_freqs,
759                                           &c->os, &c->lz_offset_rebuild_info);
760 }
761
762 static bool
763 lzms_encode_length_symbol(struct lzms_compressor *c, unsigned sym)
764 {
765         return lzms_huffman_encode_symbol(sym, c->length_codewords,
766                                           c->length_lens, c->length_freqs,
767                                           &c->os, &c->length_rebuild_info);
768 }
769
770 static bool
771 lzms_encode_delta_offset_symbol(struct lzms_compressor *c, unsigned sym)
772 {
773         return lzms_huffman_encode_symbol(sym, c->delta_offset_codewords,
774                                           c->delta_offset_lens, c->delta_offset_freqs,
775                                           &c->os, &c->delta_offset_rebuild_info);
776 }
777
778 static bool
779 lzms_encode_delta_power_symbol(struct lzms_compressor *c, unsigned sym)
780 {
781         return lzms_huffman_encode_symbol(sym, c->delta_power_codewords,
782                                           c->delta_power_lens, c->delta_power_freqs,
783                                           &c->os, &c->delta_power_rebuild_info);
784 }
785
786 static void
787 lzms_update_fast_length_costs(struct lzms_compressor *c);
788
789 /*
790  * Encode a match length.  If this causes the Huffman code for length symbols to
791  * be rebuilt, also update the length costs array used by the parser.
792  */
793 static void
794 lzms_encode_length(struct lzms_compressor *c, u32 length)
795 {
796         unsigned slot = lzms_comp_get_length_slot(c, length);
797
798         if (lzms_encode_length_symbol(c, slot))
799                 lzms_update_fast_length_costs(c);
800
801         lzms_write_bits(&c->os, length - lzms_length_slot_base[slot],
802                         lzms_extra_length_bits[slot],
803                         LZMS_MAX_EXTRA_LENGTH_BITS);
804 }
805
806 /* Encode the offset of an LZ match.  */
807 static void
808 lzms_encode_lz_offset(struct lzms_compressor *c, u32 offset)
809 {
810         unsigned slot = lzms_comp_get_offset_slot(c, offset);
811
812         lzms_encode_lz_offset_symbol(c, slot);
813         lzms_write_bits(&c->os, offset - lzms_offset_slot_base[slot],
814                         lzms_extra_offset_bits[slot],
815                         LZMS_MAX_EXTRA_OFFSET_BITS);
816 }
817
818 /* Encode the raw offset of a delta match.  */
819 static void
820 lzms_encode_delta_raw_offset(struct lzms_compressor *c, u32 raw_offset)
821 {
822         unsigned slot = lzms_comp_get_offset_slot(c, raw_offset);
823
824         lzms_encode_delta_offset_symbol(c, slot);
825         lzms_write_bits(&c->os, raw_offset - lzms_offset_slot_base[slot],
826                         lzms_extra_offset_bits[slot],
827                         LZMS_MAX_EXTRA_OFFSET_BITS);
828 }
829
830 /******************************************************************************
831  *                             Item encoding                                  *
832  ******************************************************************************/
833
834 /* Encode the specified item, which may be a literal or any type of match.  */
835 static void
836 lzms_encode_item(struct lzms_compressor *c, u32 length, u32 source)
837 {
838         /* Main bit: 0 = literal, 1 = match  */
839         int main_bit = (length > 1);
840         lzms_encode_main_bit(c, main_bit);
841
842         if (!main_bit) {
843                 /* Literal  */
844                 unsigned literal = source;
845                 lzms_encode_literal_symbol(c, literal);
846         } else {
847                 /* Match  */
848
849                 /* Match bit: 0 = LZ match, 1 = delta match  */
850                 int match_bit = (source & DELTA_SOURCE_TAG) != 0;
851                 lzms_encode_match_bit(c, match_bit);
852
853                 if (!match_bit) {
854                         /* LZ match  */
855
856                         /* LZ bit: 0 = explicit offset, 1 = repeat offset  */
857                         int lz_bit = (source < LZMS_NUM_LZ_REPS);
858                         lzms_encode_lz_bit(c, lz_bit);
859
860                         if (!lz_bit) {
861                                 /* Explicit offset LZ match  */
862                                 u32 offset = source - (LZMS_NUM_LZ_REPS - 1);
863                                 lzms_encode_lz_offset(c, offset);
864                         } else {
865                                 /* Repeat offset LZ match  */
866                                 int rep_idx = source;
867                                 for (int i = 0; i < rep_idx; i++)
868                                         lzms_encode_lz_rep_bit(c, 1, i);
869                                 if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS)
870                                         lzms_encode_lz_rep_bit(c, 0, rep_idx);
871                         }
872                 } else {
873                         /* Delta match  */
874
875                         source &= ~DELTA_SOURCE_TAG;
876
877                         /* Delta bit: 0 = explicit offset, 1 = repeat offset  */
878                         int delta_bit = (source < LZMS_NUM_DELTA_REPS);
879                         lzms_encode_delta_bit(c, delta_bit);
880
881                         if (!delta_bit) {
882                                 /* Explicit offset delta match  */
883                                 u32 power = source >> DELTA_SOURCE_POWER_SHIFT;
884                                 u32 raw_offset = (source & DELTA_SOURCE_RAW_OFFSET_MASK) -
885                                                  (LZMS_NUM_DELTA_REPS - 1);
886                                 lzms_encode_delta_power_symbol(c, power);
887                                 lzms_encode_delta_raw_offset(c, raw_offset);
888                         } else {
889                                 /* Repeat offset delta match  */
890                                 int rep_idx = source;
891                                 for (int i = 0; i < rep_idx; i++)
892                                         lzms_encode_delta_rep_bit(c, 1, i);
893                                 if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS)
894                                         lzms_encode_delta_rep_bit(c, 0, rep_idx);
895                         }
896                 }
897
898                 /* Match length (encoded the same way for any match type)  */
899                 lzms_encode_length(c, length);
900         }
901 }
902
903 /* Encode a list of matches and literals chosen by the parsing algorithm.  */
904 static void
905 lzms_encode_nonempty_item_list(struct lzms_compressor *c,
906                                struct lzms_optimum_node *end_node)
907 {
908         /* Since we've stored at each node the item we took to arrive at that
909          * node, we can trace our chosen path in backwards order.  However, for
910          * encoding we need to trace our chosen path in forwards order.  To make
911          * this possible, the following loop moves the items from their
912          * destination nodes to their source nodes, which effectively reverses
913          * the path.  (Think of it like reversing a singly-linked list.)  */
914         struct lzms_optimum_node *cur_node = end_node;
915         struct lzms_item saved_item = cur_node->item;
916         do {
917                 struct lzms_item item = saved_item;
918                 if (cur_node->num_extra_items > 0) {
919                         /* Handle an arrival via multi-item lookahead.  */
920                         unsigned i = 0;
921                         struct lzms_optimum_node *orig_node = cur_node;
922                         do {
923                                 cur_node -= item.length;
924                                 cur_node->item = item;
925                                 item = orig_node->extra_items[i];
926                         } while (++i != orig_node->num_extra_items);
927                 }
928                 cur_node -= item.length;
929                 saved_item = cur_node->item;
930                 cur_node->item = item;
931         } while (cur_node != c->optimum_nodes);
932
933         /* Now trace the chosen path in forwards order, encoding each item.  */
934         do {
935                 lzms_encode_item(c, cur_node->item.length, cur_node->item.source);
936                 cur_node += cur_node->item.length;
937         } while (cur_node != end_node);
938 }
939
940 static inline void
941 lzms_encode_item_list(struct lzms_compressor *c,
942                       struct lzms_optimum_node *end_node)
943 {
944         if (end_node != c->optimum_nodes)
945                 lzms_encode_nonempty_item_list(c, end_node);
946 }
947
948 /******************************************************************************
949  *                             Cost evalution                                 *
950  ******************************************************************************/
951
952 /*
953  * If p is the predicted probability of the next bit being a 0, then the number
954  * of bits required to encode a 0 bit using a binary range encoder is the real
955  * number -log2(p), and the number of bits required to encode a 1 bit is the
956  * real number -log2(1 - p).  To avoid computing either of these expressions at
957  * runtime, 'lzms_bit_costs' is a precomputed table that stores a mapping from
958  * probability to cost for each possible probability.  Specifically, the array
959  * indices are the numerators of the possible probabilities in LZMS, where the
960  * denominators are LZMS_PROBABILITY_DENOMINATOR; and the stored costs are the
961  * bit costs multiplied by 1<<COST_SHIFT and rounded to the nearest integer.
962  * Furthermore, the values stored for 0% and 100% probabilities are equal to the
963  * adjacent values, since these probabilities are not actually permitted.  This
964  * allows us to use the num_recent_zero_bits value from the
965  * lzms_probability_entry as the array index without fixing up these two special
966  * cases.
967  */
968 static const u32 lzms_bit_costs[LZMS_PROBABILITY_DENOMINATOR + 1] = {
969         384, 384, 320, 283, 256, 235, 219, 204,
970         192, 181, 171, 163, 155, 147, 140, 134,
971         128, 122, 117, 112, 107, 103, 99,  94,
972         91,  87,  83,  80,  76,  73,  70,  67,
973         64,  61,  58,  56,  53,  51,  48,  46,
974         43,  41,  39,  37,  35,  33,  30,  29,
975         27,  25,  23,  21,  19,  17,  16,  14,
976         12,  11,  9,   8,   6,   4,   3,   1,
977         1
978 };
979
980 static inline void
981 check_cost_shift(void)
982 {
983         /* lzms_bit_costs is hard-coded to the current COST_SHIFT.  */
984         BUILD_BUG_ON(COST_SHIFT != 6);
985 }
986
987 #if 0
988 #include <math.h>
989
990 static void
991 lzms_compute_bit_costs(void)
992 {
993         for (u32 i = 0; i <= LZMS_PROBABILITY_DENOMINATOR; i++) {
994                 u32 prob = i;
995                 if (prob == 0)
996                         prob++;
997                 else if (prob == LZMS_PROBABILITY_DENOMINATOR)
998                         prob--;
999
1000                 lzms_bit_costs[i] = round(-log2((double)prob / LZMS_PROBABILITY_DENOMINATOR) *
1001                                           (1 << COST_SHIFT));
1002         }
1003 }
1004 #endif
1005
1006 /* Return the cost to encode a 0 bit in the specified context.  */
1007 static inline u32
1008 lzms_bit_0_cost(unsigned state, const struct lzms_probability_entry *probs)
1009 {
1010         return lzms_bit_costs[probs[state].num_recent_zero_bits];
1011 }
1012
1013 /* Return the cost to encode a 1 bit in the specified context.  */
1014 static inline u32
1015 lzms_bit_1_cost(unsigned state, const struct lzms_probability_entry *probs)
1016 {
1017         return lzms_bit_costs[LZMS_PROBABILITY_DENOMINATOR -
1018                               probs[state].num_recent_zero_bits];
1019 }
1020
1021 /* Return the cost to encode a literal, including the main bit.  */
1022 static inline u32
1023 lzms_literal_cost(struct lzms_compressor *c, unsigned main_state, unsigned literal)
1024 {
1025         return lzms_bit_0_cost(main_state, c->main_probs) +
1026                 ((u32)c->literal_lens[literal] << COST_SHIFT);
1027 }
1028
1029 /* Update 'fast_length_cost_tab' to use the latest Huffman code.  */
1030 static void
1031 lzms_update_fast_length_costs(struct lzms_compressor *c)
1032 {
1033         int slot = -1;
1034         u32 cost = 0;
1035         for (u32 len = LZMS_MIN_MATCH_LENGTH; len <= MAX_FAST_LENGTH; len++) {
1036                 if (len >= lzms_length_slot_base[slot + 1]) {
1037                         slot++;
1038                         cost = (u32)(c->length_lens[slot] +
1039                                      lzms_extra_length_bits[slot]) << COST_SHIFT;
1040                 }
1041                 c->fast_length_cost_tab[len] = cost;
1042         }
1043 }
1044
1045 /* Return the cost to encode the specified match length, which must not exceed
1046  * MAX_FAST_LENGTH.  */
1047 static inline u32
1048 lzms_fast_length_cost(const struct lzms_compressor *c, u32 length)
1049 {
1050         return c->fast_length_cost_tab[length];
1051 }
1052
1053 /* Return the cost to encode the specified LZ match offset.  */
1054 static inline u32
1055 lzms_lz_offset_cost(const struct lzms_compressor *c, u32 offset)
1056 {
1057         unsigned slot = lzms_comp_get_offset_slot(c, offset);
1058         u32 num_bits = c->lz_offset_lens[slot] + lzms_extra_offset_bits[slot];
1059         return num_bits << COST_SHIFT;
1060 }
1061
1062 /* Return the cost to encode the specified delta power and raw offset.  */
1063 static inline u32
1064 lzms_delta_source_cost(const struct lzms_compressor *c, u32 power, u32 raw_offset)
1065 {
1066         unsigned slot = lzms_comp_get_offset_slot(c, raw_offset);
1067         u32 num_bits = c->delta_power_lens[power] + c->delta_offset_lens[slot] +
1068                        lzms_extra_offset_bits[slot];
1069         return num_bits << COST_SHIFT;
1070 }
1071
1072 /******************************************************************************
1073  *                              Adaptive state                                *
1074  ******************************************************************************/
1075
1076 static void
1077 lzms_init_adaptive_state(struct lzms_adaptive_state *state)
1078 {
1079         for (int i = 0; i < LZMS_NUM_LZ_REPS + 1; i++)
1080                 state->recent_lz_offsets[i] = i + 1;
1081         state->prev_lz_offset = 0;
1082         state->upcoming_lz_offset = 0;
1083
1084         for (int i = 0; i < LZMS_NUM_DELTA_REPS + 1; i++)
1085                 state->recent_delta_pairs[i] = i + 1;
1086         state->prev_delta_pair = 0;
1087         state->upcoming_delta_pair = 0;
1088
1089         state->main_state = 0;
1090         state->match_state = 0;
1091         state->lz_state = 0;
1092         for (int i = 0; i < LZMS_NUM_LZ_REP_DECISIONS; i++)
1093                 state->lz_rep_states[i] = 0;
1094         state->delta_state = 0;
1095         for (int i = 0; i < LZMS_NUM_DELTA_REP_DECISIONS; i++)
1096                 state->delta_rep_states[i] = 0;
1097 }
1098
1099 /*
1100  * Update the LRU queues for match sources when advancing by one item.
1101  *
1102  * Note: using LZMA as a point of comparison, the LRU queues in LZMS are more
1103  * complex because:
1104  *      - there are separate queues for LZ and delta matches
1105  *      - updates to the queues are delayed by one encoded item (this prevents
1106  *        sources from being bumped up to index 0 too early)
1107  */
1108 static void
1109 lzms_update_lru_queues(struct lzms_adaptive_state *state)
1110 {
1111         if (state->prev_lz_offset != 0) {
1112                 for (int i = LZMS_NUM_LZ_REPS - 1; i >= 0; i--)
1113                         state->recent_lz_offsets[i + 1] = state->recent_lz_offsets[i];
1114                 state->recent_lz_offsets[0] = state->prev_lz_offset;
1115         }
1116         state->prev_lz_offset = state->upcoming_lz_offset;
1117
1118         if (state->prev_delta_pair != 0) {
1119                 for (int i = LZMS_NUM_DELTA_REPS - 1; i >= 0; i--)
1120                         state->recent_delta_pairs[i + 1] = state->recent_delta_pairs[i];
1121                 state->recent_delta_pairs[0] = state->prev_delta_pair;
1122         }
1123         state->prev_delta_pair = state->upcoming_delta_pair;
1124 }
1125
1126 static inline void
1127 lzms_update_state(u8 *state_p, int bit, unsigned num_states)
1128 {
1129         *state_p = ((*state_p << 1) | bit) % num_states;
1130 }
1131
1132 static inline void
1133 lzms_update_main_state(struct lzms_adaptive_state *state, int is_match)
1134 {
1135         lzms_update_state(&state->main_state, is_match, LZMS_NUM_MAIN_PROBS);
1136 }
1137
1138 static inline void
1139 lzms_update_match_state(struct lzms_adaptive_state *state, int is_delta)
1140 {
1141         lzms_update_state(&state->match_state, is_delta, LZMS_NUM_MATCH_PROBS);
1142 }
1143
1144 static inline void
1145 lzms_update_lz_state(struct lzms_adaptive_state *state, int is_rep)
1146 {
1147         lzms_update_state(&state->lz_state, is_rep, LZMS_NUM_LZ_PROBS);
1148 }
1149
1150 static inline void
1151 lzms_update_lz_rep_states(struct lzms_adaptive_state *state, int rep_idx)
1152 {
1153         for (int i = 0; i < rep_idx; i++)
1154                 lzms_update_state(&state->lz_rep_states[i], 1, LZMS_NUM_LZ_REP_PROBS);
1155
1156         if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS)
1157                 lzms_update_state(&state->lz_rep_states[rep_idx], 0, LZMS_NUM_LZ_REP_PROBS);
1158 }
1159
1160 static inline void
1161 lzms_update_delta_state(struct lzms_adaptive_state *state, int is_rep)
1162 {
1163         lzms_update_state(&state->delta_state, is_rep, LZMS_NUM_DELTA_PROBS);
1164 }
1165
1166 static inline void
1167 lzms_update_delta_rep_states(struct lzms_adaptive_state *state, int rep_idx)
1168 {
1169         for (int i = 0; i < rep_idx; i++)
1170                 lzms_update_state(&state->delta_rep_states[i], 1, LZMS_NUM_DELTA_REP_PROBS);
1171
1172         if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS)
1173                 lzms_update_state(&state->delta_rep_states[rep_idx], 0, LZMS_NUM_DELTA_REP_PROBS);
1174 }
1175
1176 /******************************************************************************
1177  *                              Matchfinding                                  *
1178  ******************************************************************************/
1179
1180 /* Note: this code just handles finding delta matches.  The code for finding LZ
1181  * matches is elsewhere.  */
1182
1183
1184 /* Initialize the delta matchfinder for a new input buffer.  */
1185 static void
1186 lzms_init_delta_matchfinder(struct lzms_compressor *c)
1187 {
1188         /* Set all entries to use an invalid power, which will never match.  */
1189         BUILD_BUG_ON(NUM_POWERS_TO_CONSIDER >= (1 << (32 - DELTA_SOURCE_POWER_SHIFT)));
1190         memset(c->delta_hash_table, 0xFF, sizeof(c->delta_hash_table));
1191
1192         /* Initialize the next hash code for each power.  We can just use zeroes
1193          * initially; it doesn't really matter.  */
1194         for (u32 i = 0; i < NUM_POWERS_TO_CONSIDER; i++)
1195                 c->next_delta_hashes[i] = 0;
1196 }
1197
1198 /*
1199  * Compute a DELTA_HASH_ORDER-bit hash code for the first
1200  * NBYTES_HASHED_FOR_DELTA bytes of the sequence beginning at @p when taken in a
1201  * delta context with the specified @span.
1202  */
1203 static inline u32
1204 lzms_delta_hash(const u8 *p, u32 span)
1205 {
1206         /* A delta match has a certain span and an offset that is a multiple of
1207          * that span.  To reduce wasted space we use a single combined hash
1208          * table for all spans and positions, but to minimize collisions we
1209          * include in the hash code computation the span and the low-order bits
1210          * of the current position.  */
1211
1212         BUILD_BUG_ON(NBYTES_HASHED_FOR_DELTA != 3);
1213         u8 d0 = *(p + 0) - *(p + 0 - span);
1214         u8 d1 = *(p + 1) - *(p + 1 - span);
1215         u8 d2 = *(p + 2) - *(p + 2 - span);
1216         u32 v = ((span + ((u32)(uintptr_t)p & (span - 1))) << 24) |
1217                 ((u32)d2 << 16) | ((u32)d1 << 8) | d0;
1218         return lz_hash(v, DELTA_HASH_ORDER);
1219 }
1220
1221 /*
1222  * Given a match between @in_next and @matchptr in a delta context with the
1223  * specified @span and having the initial @len, extend the match as far as
1224  * possible, up to a limit of @max_len.
1225  */
1226 static inline u32
1227 lzms_extend_delta_match(const u8 *in_next, const u8 *matchptr,
1228                         u32 len, u32 max_len, u32 span)
1229 {
1230         while (len < max_len &&
1231                (u8)(*(in_next + len) - *(in_next + len - span)) ==
1232                (u8)(*(matchptr + len) - *(matchptr + len - span)))
1233         {
1234                 len++;
1235         }
1236         return len;
1237 }
1238
1239 static void
1240 lzms_delta_matchfinder_skip_bytes(struct lzms_compressor *c,
1241                                   const u8 *in_next, u32 count)
1242 {
1243         u32 pos = in_next - c->in_buffer;
1244         if (unlikely(c->in_nbytes - (pos + count) <= NBYTES_HASHED_FOR_DELTA + 1))
1245                 return;
1246         do {
1247                 /* Update the hash table for each power.  */
1248                 for (u32 power = 0; power < NUM_POWERS_TO_CONSIDER; power++) {
1249                         const u32 span = (u32)1 << power;
1250                         if (unlikely(pos < span))
1251                                 continue;
1252                         const u32 next_hash = lzms_delta_hash(in_next + 1, span);
1253                         const u32 hash = c->next_delta_hashes[power];
1254                         c->delta_hash_table[hash] =
1255                                 (power << DELTA_SOURCE_POWER_SHIFT) | pos;
1256                         c->next_delta_hashes[power] = next_hash;
1257                         prefetch(&c->delta_hash_table[next_hash]);
1258                 }
1259         } while (in_next++, pos++, --count);
1260 }
1261
1262 /*
1263  * Skip the next @count bytes (don't search for matches at them).  @in_next
1264  * points to the first byte to skip.  The return value is @in_next + count.
1265  */
1266 static const u8 *
1267 lzms_skip_bytes(struct lzms_compressor *c, u32 count, const u8 *in_next)
1268 {
1269         lcpit_matchfinder_skip_bytes(&c->mf, count);
1270         if (c->use_delta_matches)
1271                 lzms_delta_matchfinder_skip_bytes(c, in_next, count);
1272         return in_next + count;
1273 }
1274
1275 /******************************************************************************
1276  *                          "Near-optimal" parsing                            *
1277  ******************************************************************************/
1278
1279 /*
1280  * The main near-optimal parsing routine.
1281  *
1282  * Briefly, the algorithm does an approximate minimum-cost path search to find a
1283  * "near-optimal" sequence of matches and literals to output, based on the
1284  * current cost model.  The algorithm steps forward, position by position (byte
1285  * by byte), and updates the minimum cost path to reach each later position that
1286  * can be reached using a match or literal from the current position.  This is
1287  * essentially Dijkstra's algorithm in disguise: the graph nodes are positions,
1288  * the graph edges are possible matches/literals to code, and the cost of each
1289  * edge is the estimated number of bits that will be required to output the
1290  * corresponding match or literal.  But one difference is that we actually
1291  * compute the lowest-cost path in pieces, where each piece is terminated when
1292  * there are no choices to be made.
1293  *
1294  * The costs of literals and matches are estimated using the range encoder
1295  * states and the semi-adaptive Huffman codes.  Except for range encoding
1296  * states, costs are assumed to be constant throughout a single run of the
1297  * parsing algorithm, which can parse up to NUM_OPTIM_NODES bytes of data.  This
1298  * introduces a source of non-optimality because the probabilities and Huffman
1299  * codes can change over this part of the data.  And of course, there are
1300  * various other reasons why the result isn't optimal in terms of compression
1301  * ratio.
1302  */
1303 static void
1304 lzms_near_optimal_parse(struct lzms_compressor *c)
1305 {
1306         const u8 *in_next = c->in_buffer;
1307         const u8 * const in_end = &c->in_buffer[c->in_nbytes];
1308         struct lzms_optimum_node *cur_node;
1309         struct lzms_optimum_node *end_node;
1310
1311         /* Set initial length costs for lengths <= MAX_FAST_LENGTH.  */
1312         lzms_update_fast_length_costs(c);
1313
1314         /* Set up the initial adaptive state.  */
1315         lzms_init_adaptive_state(&c->optimum_nodes[0].state);
1316
1317 begin:
1318         /* Start building a new list of items, which will correspond to the next
1319          * piece of the overall minimum-cost path.  */
1320
1321         cur_node = c->optimum_nodes;
1322         cur_node->cost = 0;
1323         end_node = cur_node;
1324
1325         if (in_next == in_end)
1326                 return;
1327
1328         /* The following loop runs once for each per byte in the input buffer,
1329          * except in a few shortcut cases.  */
1330         for (;;) {
1331                 u32 num_matches;
1332
1333                 /* Repeat offset LZ matches  */
1334                 if (likely(in_next - c->in_buffer >= LZMS_NUM_LZ_REPS &&
1335                            in_end - in_next >= 2))
1336                 {
1337                         for (int rep_idx = 0; rep_idx < LZMS_NUM_LZ_REPS; rep_idx++) {
1338
1339                                 /* Looking for a repeat offset LZ match at queue
1340                                  * index @rep_idx  */
1341
1342                                 const u32 offset = cur_node->state.recent_lz_offsets[rep_idx];
1343                                 const u8 * const matchptr = in_next - offset;
1344
1345                                 /* Check the first 2 bytes before entering the extension loop.  */
1346                                 if (load_u16_unaligned(in_next) != load_u16_unaligned(matchptr))
1347                                         continue;
1348
1349                                 /* Extend the match to its full length.  */
1350                                 const u32 rep_len = lz_extend(in_next, matchptr, 2, in_end - in_next);
1351
1352                                 /* Early out for long repeat offset LZ match */
1353                                 if (rep_len >= c->mf.nice_match_len) {
1354
1355                                         in_next = lzms_skip_bytes(c, rep_len, in_next);
1356
1357                                         lzms_encode_item_list(c, cur_node);
1358                                         lzms_encode_item(c, rep_len, rep_idx);
1359
1360                                         c->optimum_nodes[0].state = cur_node->state;
1361                                         cur_node = &c->optimum_nodes[0];
1362
1363                                         cur_node->state.upcoming_lz_offset =
1364                                                 cur_node->state.recent_lz_offsets[rep_idx];
1365                                         cur_node->state.upcoming_delta_pair = 0;
1366                                         for (int i = rep_idx; i < LZMS_NUM_LZ_REPS; i++)
1367                                                 cur_node->state.recent_lz_offsets[i] =
1368                                                         cur_node->state.recent_lz_offsets[i + 1];
1369                                         lzms_update_lru_queues(&cur_node->state);
1370                                         lzms_update_main_state(&cur_node->state, 1);
1371                                         lzms_update_match_state(&cur_node->state, 0);
1372                                         lzms_update_lz_state(&cur_node->state, 1);
1373                                         lzms_update_lz_rep_states(&cur_node->state, rep_idx);
1374                                         goto begin;
1375                                 }
1376
1377                                 while (end_node < cur_node + rep_len)
1378                                         (++end_node)->cost = INFINITE_COST;
1379
1380                                 u32 base_cost = cur_node->cost +
1381                                                 lzms_bit_1_cost(cur_node->state.main_state,
1382                                                                 c->main_probs) +
1383                                                 lzms_bit_0_cost(cur_node->state.match_state,
1384                                                                 c->match_probs) +
1385                                                 lzms_bit_1_cost(cur_node->state.lz_state,
1386                                                                 c->lz_probs);
1387
1388                                 for (int i = 0; i < rep_idx; i++)
1389                                         base_cost += lzms_bit_1_cost(cur_node->state.lz_rep_states[i],
1390                                                                      c->lz_rep_probs[i]);
1391
1392                                 if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS)
1393                                         base_cost += lzms_bit_0_cost(cur_node->state.lz_rep_states[rep_idx],
1394                                                                      c->lz_rep_probs[rep_idx]);
1395
1396                                 u32 len = 2;
1397                                 do {
1398                                         u32 cost = base_cost + lzms_fast_length_cost(c, len);
1399                                         if (cost < (cur_node + len)->cost) {
1400                                                 (cur_node + len)->cost = cost;
1401                                                 (cur_node + len)->item = (struct lzms_item) {
1402                                                         .length = len,
1403                                                         .source = rep_idx,
1404                                                 };
1405                                                 (cur_node + len)->num_extra_items = 0;
1406                                         }
1407                                 } while (++len <= rep_len);
1408
1409
1410                                 /* try LZ-rep + lit + LZ-rep0  */
1411                                 if (c->try_lzrep_lit_lzrep0 &&
1412                                     in_end - (in_next + rep_len) >= 3 &&
1413                                     load_u16_unaligned(in_next + rep_len + 1) ==
1414                                     load_u16_unaligned(matchptr + rep_len + 1))
1415                                 {
1416                                         const u32 rep0_len = lz_extend(in_next + rep_len + 1,
1417                                                                        matchptr + rep_len + 1,
1418                                                                        2,
1419                                                                        min(c->mf.nice_match_len,
1420                                                                            in_end - (in_next + rep_len + 1)));
1421
1422                                         unsigned main_state = cur_node->state.main_state;
1423                                         unsigned match_state = cur_node->state.match_state;
1424                                         unsigned lz_state = cur_node->state.lz_state;
1425                                         unsigned lz_rep0_state = cur_node->state.lz_rep_states[0];
1426
1427                                         /* update states for LZ-rep  */
1428                                         main_state = ((main_state << 1) | 1) % LZMS_NUM_MAIN_PROBS;
1429                                         match_state = ((match_state << 1) | 0) % LZMS_NUM_MATCH_PROBS;
1430                                         lz_state = ((lz_state << 1) | 1) % LZMS_NUM_LZ_PROBS;
1431                                         lz_rep0_state = ((lz_rep0_state << 1) | (rep_idx > 0)) %
1432                                                                 LZMS_NUM_LZ_REP_PROBS;
1433
1434                                         /* LZ-rep cost  */
1435                                         u32 cost = base_cost + lzms_fast_length_cost(c, rep_len);
1436
1437                                         /* add literal cost  */
1438                                         cost += lzms_literal_cost(c, main_state, *(in_next + rep_len));
1439
1440                                         /* update state for literal  */
1441                                         main_state = ((main_state << 1) | 0) % LZMS_NUM_MAIN_PROBS;
1442
1443                                         /* add LZ-rep0 cost  */
1444                                         cost += lzms_bit_1_cost(main_state, c->main_probs) +
1445                                                 lzms_bit_0_cost(match_state, c->match_probs) +
1446                                                 lzms_bit_1_cost(lz_state, c->lz_probs) +
1447                                                 lzms_bit_0_cost(lz_rep0_state, c->lz_rep_probs[0]) +
1448                                                 lzms_fast_length_cost(c, rep0_len);
1449
1450                                         const u32 total_len = rep_len + 1 + rep0_len;
1451
1452                                         while (end_node < cur_node + total_len)
1453                                                 (++end_node)->cost = INFINITE_COST;
1454
1455                                         if (cost < (cur_node + total_len)->cost) {
1456                                                 (cur_node + total_len)->cost = cost;
1457                                                 (cur_node + total_len)->item = (struct lzms_item) {
1458                                                         .length = rep0_len,
1459                                                         .source = 0,
1460                                                 };
1461                                                 (cur_node + total_len)->extra_items[0] = (struct lzms_item) {
1462                                                         .length = 1,
1463                                                         .source = *(in_next + rep_len),
1464                                                 };
1465                                                 (cur_node + total_len)->extra_items[1] = (struct lzms_item) {
1466                                                         .length = rep_len,
1467                                                         .source = rep_idx,
1468                                                 };
1469                                                 (cur_node + total_len)->num_extra_items = 2;
1470                                         }
1471                                 }
1472                         }
1473                 }
1474
1475                 /* Repeat offset delta matches  */
1476                 if (c->use_delta_matches &&
1477                     likely(in_next - c->in_buffer >= LZMS_NUM_DELTA_REPS + 1 &&
1478                            (in_end - in_next >= 2)))
1479                 {
1480                         for (int rep_idx = 0; rep_idx < LZMS_NUM_DELTA_REPS; rep_idx++) {
1481
1482                                 /* Looking for a repeat offset delta match at
1483                                  * queue index @rep_idx  */
1484
1485                                 const u32 pair = cur_node->state.recent_delta_pairs[rep_idx];
1486                                 const u32 power = pair >> DELTA_SOURCE_POWER_SHIFT;
1487                                 const u32 raw_offset = pair & DELTA_SOURCE_RAW_OFFSET_MASK;
1488                                 const u32 span = (u32)1 << power;
1489                                 const u32 offset = raw_offset << power;
1490                                 const u8 * const matchptr = in_next - offset;
1491
1492                                 /* Check the first 2 bytes before entering the
1493                                  * extension loop.  */
1494                                 if (((u8)(*(in_next + 0) - *(in_next + 0 - span)) !=
1495                                      (u8)(*(matchptr + 0) - *(matchptr + 0 - span))) ||
1496                                     ((u8)(*(in_next + 1) - *(in_next + 1 - span)) !=
1497                                      (u8)(*(matchptr + 1) - *(matchptr + 1 - span))))
1498                                         continue;
1499
1500                                 /* Extend the match to its full length.  */
1501                                 const u32 rep_len = lzms_extend_delta_match(in_next, matchptr,
1502                                                                             2, in_end - in_next,
1503                                                                             span);
1504
1505                                 /* Early out for long repeat offset delta match */
1506                                 if (rep_len >= c->mf.nice_match_len) {
1507
1508                                         in_next = lzms_skip_bytes(c, rep_len, in_next);
1509
1510                                         lzms_encode_item_list(c, cur_node);
1511                                         lzms_encode_item(c, rep_len, DELTA_SOURCE_TAG | rep_idx);
1512
1513                                         c->optimum_nodes[0].state = cur_node->state;
1514                                         cur_node = &c->optimum_nodes[0];
1515
1516                                         cur_node->state.upcoming_delta_pair = pair;
1517                                         cur_node->state.upcoming_lz_offset = 0;
1518                                         for (int i = rep_idx; i < LZMS_NUM_DELTA_REPS; i++)
1519                                                 cur_node->state.recent_delta_pairs[i] =
1520                                                         cur_node->state.recent_delta_pairs[i + 1];
1521                                         lzms_update_lru_queues(&cur_node->state);
1522                                         lzms_update_main_state(&cur_node->state, 1);
1523                                         lzms_update_match_state(&cur_node->state, 1);
1524                                         lzms_update_delta_state(&cur_node->state, 1);
1525                                         lzms_update_delta_rep_states(&cur_node->state, rep_idx);
1526                                         goto begin;
1527                                 }
1528
1529                                 while (end_node < cur_node + rep_len)
1530                                         (++end_node)->cost = INFINITE_COST;
1531
1532                                 u32 base_cost = cur_node->cost +
1533                                                 lzms_bit_1_cost(cur_node->state.main_state,
1534                                                                 c->main_probs) +
1535                                                 lzms_bit_1_cost(cur_node->state.match_state,
1536                                                                 c->match_probs) +
1537                                                 lzms_bit_1_cost(cur_node->state.delta_state,
1538                                                                 c->delta_probs);
1539
1540                                 for (int i = 0; i < rep_idx; i++)
1541                                         base_cost += lzms_bit_1_cost(cur_node->state.delta_rep_states[i],
1542                                                                      c->delta_rep_probs[i]);
1543
1544                                 if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS)
1545                                         base_cost += lzms_bit_0_cost(cur_node->state.delta_rep_states[rep_idx],
1546                                                                      c->delta_rep_probs[rep_idx]);
1547
1548                                 u32 len = 2;
1549                                 do {
1550                                         u32 cost = base_cost + lzms_fast_length_cost(c, len);
1551                                         if (cost < (cur_node + len)->cost) {
1552                                                 (cur_node + len)->cost = cost;
1553                                                 (cur_node + len)->item = (struct lzms_item) {
1554                                                         .length = len,
1555                                                         .source = DELTA_SOURCE_TAG | rep_idx,
1556                                                 };
1557                                                 (cur_node + len)->num_extra_items = 0;
1558                                         }
1559                                 } while (++len <= rep_len);
1560                         }
1561                 }
1562
1563                 /* Explicit offset LZ matches  */
1564                 num_matches = lcpit_matchfinder_get_matches(&c->mf, c->matches);
1565                 if (num_matches) {
1566
1567                         u32 best_len = c->matches[0].length;
1568
1569                         /* Early out for long explicit offset LZ match  */
1570                         if (best_len >= c->mf.nice_match_len) {
1571
1572                                 const u32 offset = c->matches[0].offset;
1573
1574                                 /* Extend the match as far as possible.
1575                                  * This is necessary because the LCP-interval
1576                                  * tree matchfinder only reports up to
1577                                  * nice_match_len bytes.  */
1578                                 best_len = lz_extend(in_next, in_next - offset,
1579                                                      best_len, in_end - in_next);
1580
1581                                 in_next = lzms_skip_bytes(c, best_len - 1, in_next + 1);
1582
1583                                 lzms_encode_item_list(c, cur_node);
1584                                 lzms_encode_item(c, best_len, offset + LZMS_NUM_LZ_REPS - 1);
1585
1586                                 c->optimum_nodes[0].state = cur_node->state;
1587                                 cur_node = &c->optimum_nodes[0];
1588
1589                                 cur_node->state.upcoming_lz_offset = offset;
1590                                 cur_node->state.upcoming_delta_pair = 0;
1591                                 lzms_update_lru_queues(&cur_node->state);
1592                                 lzms_update_main_state(&cur_node->state, 1);
1593                                 lzms_update_match_state(&cur_node->state, 0);
1594                                 lzms_update_lz_state(&cur_node->state, 0);
1595                                 goto begin;
1596                         }
1597
1598                         while (end_node < cur_node + best_len)
1599                                 (++end_node)->cost = INFINITE_COST;
1600
1601                         u32 base_cost = cur_node->cost +
1602                                         lzms_bit_1_cost(cur_node->state.main_state,
1603                                                         c->main_probs) +
1604                                         lzms_bit_0_cost(cur_node->state.match_state,
1605                                                         c->match_probs) +
1606                                         lzms_bit_0_cost(cur_node->state.lz_state,
1607                                                         c->lz_probs);
1608
1609                         if (c->try_lzmatch_lit_lzrep0 &&
1610                             likely(in_end - (in_next + c->matches[0].length) >= 3))
1611                         {
1612                                 /* try LZ-match + lit + LZ-rep0  */
1613
1614                                 u32 l = 2;
1615                                 u32 i = num_matches - 1;
1616                                 do {
1617                                         const u32 len = c->matches[i].length;
1618                                         const u32 offset = c->matches[i].offset;
1619                                         const u32 position_cost = base_cost +
1620                                                                   lzms_lz_offset_cost(c, offset);
1621                                         do {
1622                                                 u32 cost = position_cost + lzms_fast_length_cost(c, l);
1623                                                 if (cost < (cur_node + l)->cost) {
1624                                                         (cur_node + l)->cost = cost;
1625                                                         (cur_node + l)->item = (struct lzms_item) {
1626                                                                 .length = l,
1627                                                                 .source = offset + (LZMS_NUM_LZ_REPS - 1),
1628                                                         };
1629                                                         (cur_node + l)->num_extra_items = 0;
1630                                                 }
1631                                         } while (++l <= len);
1632
1633                                         const u8 * const matchptr = in_next - offset;
1634                                         if (load_u16_unaligned(matchptr + len + 1) !=
1635                                             load_u16_unaligned(in_next + len + 1))
1636                                                 continue;
1637
1638                                         const u32 rep0_len = lz_extend(in_next + len + 1,
1639                                                                        matchptr + len + 1,
1640                                                                        2,
1641                                                                        min(c->mf.nice_match_len,
1642                                                                            in_end - (in_next + len + 1)));
1643
1644                                         unsigned main_state = cur_node->state.main_state;
1645                                         unsigned match_state = cur_node->state.match_state;
1646                                         unsigned lz_state = cur_node->state.lz_state;
1647
1648                                         /* update states for LZ-match  */
1649                                         main_state = ((main_state << 1) | 1) % LZMS_NUM_MAIN_PROBS;
1650                                         match_state = ((match_state << 1) | 0) % LZMS_NUM_MATCH_PROBS;
1651                                         lz_state = ((lz_state << 1) | 0) % LZMS_NUM_LZ_PROBS;
1652
1653                                         /* LZ-match cost  */
1654                                         u32 cost = position_cost + lzms_fast_length_cost(c, len);
1655
1656                                         /* add literal cost  */
1657                                         cost += lzms_literal_cost(c, main_state, *(in_next + len));
1658
1659                                         /* update state for literal  */
1660                                         main_state = ((main_state << 1) | 0) % LZMS_NUM_MAIN_PROBS;
1661
1662                                         /* add LZ-rep0 cost  */
1663                                         cost += lzms_bit_1_cost(main_state, c->main_probs) +
1664                                                 lzms_bit_0_cost(match_state, c->match_probs) +
1665                                                 lzms_bit_1_cost(lz_state, c->lz_probs) +
1666                                                 lzms_bit_0_cost(cur_node->state.lz_rep_states[0],
1667                                                                 c->lz_rep_probs[0]) +
1668                                                 lzms_fast_length_cost(c, rep0_len);
1669
1670                                         const u32 total_len = len + 1 + rep0_len;
1671
1672                                         while (end_node < cur_node + total_len)
1673                                                 (++end_node)->cost = INFINITE_COST;
1674
1675                                         if (cost < (cur_node + total_len)->cost) {
1676                                                 (cur_node + total_len)->cost = cost;
1677                                                 (cur_node + total_len)->item = (struct lzms_item) {
1678                                                         .length = rep0_len,
1679                                                         .source = 0,
1680                                                 };
1681                                                 (cur_node + total_len)->extra_items[0] = (struct lzms_item) {
1682                                                         .length = 1,
1683                                                         .source = *(in_next + len),
1684                                                 };
1685                                                 (cur_node + total_len)->extra_items[1] = (struct lzms_item) {
1686                                                         .length = len,
1687                                                         .source = offset + LZMS_NUM_LZ_REPS - 1,
1688                                                 };
1689                                                 (cur_node + total_len)->num_extra_items = 2;
1690                                         }
1691                                 } while (i--);
1692                         } else {
1693                                 u32 l = 2;
1694                                 u32 i = num_matches - 1;
1695                                 do {
1696                                         u32 position_cost = base_cost +
1697                                                             lzms_lz_offset_cost(c, c->matches[i].offset);
1698                                         do {
1699                                                 u32 cost = position_cost + lzms_fast_length_cost(c, l);
1700                                                 if (cost < (cur_node + l)->cost) {
1701                                                         (cur_node + l)->cost = cost;
1702                                                         (cur_node + l)->item = (struct lzms_item) {
1703                                                                 .length = l,
1704                                                                 .source = c->matches[i].offset +
1705                                                                           (LZMS_NUM_LZ_REPS - 1),
1706                                                         };
1707                                                         (cur_node + l)->num_extra_items = 0;
1708                                                 }
1709                                         } while (++l <= c->matches[i].length);
1710                                 } while (i--);
1711                         }
1712                 }
1713
1714                 /* Explicit offset delta matches  */
1715                 if (c->use_delta_matches &&
1716                     likely(in_end - in_next >= NBYTES_HASHED_FOR_DELTA + 1))
1717                 {
1718                         const u32 pos = in_next - c->in_buffer;
1719
1720                         /* Consider each possible power (log2 of span)  */
1721                         BUILD_BUG_ON(NUM_POWERS_TO_CONSIDER > LZMS_NUM_DELTA_POWER_SYMS);
1722                         for (u32 power = 0; power < NUM_POWERS_TO_CONSIDER; power++) {
1723
1724                                 const u32 span = (u32)1 << power;
1725
1726                                 if (unlikely(pos < span))
1727                                         continue;
1728
1729                                 const u32 next_hash = lzms_delta_hash(in_next + 1, span);
1730                                 const u32 hash = c->next_delta_hashes[power];
1731                                 const u32 cur_match = c->delta_hash_table[hash];
1732
1733                                 c->delta_hash_table[hash] = (power << DELTA_SOURCE_POWER_SHIFT) | pos;
1734                                 c->next_delta_hashes[power] = next_hash;
1735                                 prefetch(&c->delta_hash_table[next_hash]);
1736
1737                                 if (power != cur_match >> DELTA_SOURCE_POWER_SHIFT)
1738                                         continue;
1739
1740                                 const u32 offset = pos - (cur_match & DELTA_SOURCE_RAW_OFFSET_MASK);
1741
1742                                 /* The offset must be a multiple of span.  */
1743                                 if (offset & (span - 1))
1744                                         continue;
1745
1746                                 const u8 * const matchptr = in_next - offset;
1747
1748                                 /* Check the first 3 bytes before entering the
1749                                  * extension loop.  */
1750                                 BUILD_BUG_ON(NBYTES_HASHED_FOR_DELTA != 3);
1751                                 if (((u8)(*(in_next + 0) - *(in_next + 0 - span)) !=
1752                                      (u8)(*(matchptr + 0) - *(matchptr + 0 - span))) ||
1753                                     ((u8)(*(in_next + 1) - *(in_next + 1 - span)) !=
1754                                      (u8)(*(matchptr + 1) - *(matchptr + 1 - span))) ||
1755                                     ((u8)(*(in_next + 2) - *(in_next + 2 - span)) !=
1756                                      (u8)(*(matchptr + 2) - *(matchptr + 2 - span))))
1757                                         continue;
1758
1759                                 /* Extend the delta match to its full length.  */
1760                                 const u32 len = lzms_extend_delta_match(in_next,
1761                                                                         matchptr,
1762                                                                         3,
1763                                                                         in_end - in_next,
1764                                                                         span);
1765
1766                                 const u32 raw_offset = offset >> power;
1767                                 const u32 pair = (power << DELTA_SOURCE_POWER_SHIFT) |
1768                                                  raw_offset;
1769                                 const u32 source = DELTA_SOURCE_TAG |
1770                                                    (pair + LZMS_NUM_DELTA_REPS - 1);
1771
1772                                 /* Early out for long explicit offset delta match  */
1773                                 if (len >= c->mf.nice_match_len) {
1774
1775                                         in_next = lzms_skip_bytes(c, len - 1, in_next + 1);
1776
1777                                         lzms_encode_item_list(c, cur_node);
1778                                         lzms_encode_item(c, len, source);
1779
1780                                         c->optimum_nodes[0].state = cur_node->state;
1781                                         cur_node = &c->optimum_nodes[0];
1782
1783                                         cur_node->state.upcoming_lz_offset = 0;
1784                                         cur_node->state.upcoming_delta_pair = pair;
1785                                         lzms_update_lru_queues(&cur_node->state);
1786                                         lzms_update_main_state(&cur_node->state, 1);
1787                                         lzms_update_match_state(&cur_node->state, 1);
1788                                         lzms_update_delta_state(&cur_node->state, 0);
1789                                         goto begin;
1790                                 }
1791
1792                                 while (end_node < cur_node + len)
1793                                         (++end_node)->cost = INFINITE_COST;
1794
1795                                 u32 base_cost = cur_node->cost +
1796                                                 lzms_bit_1_cost(cur_node->state.main_state,
1797                                                                 c->main_probs) +
1798                                                 lzms_bit_1_cost(cur_node->state.match_state,
1799                                                                 c->match_probs) +
1800                                                 lzms_bit_0_cost(cur_node->state.delta_state,
1801                                                                 c->delta_probs) +
1802                                                 lzms_delta_source_cost(c, power, raw_offset);
1803
1804                                 u32 l = NBYTES_HASHED_FOR_DELTA;
1805                                 do {
1806                                         u32 cost = base_cost + lzms_fast_length_cost(c, l);
1807                                         if (cost < (cur_node + l)->cost) {
1808                                                 (cur_node + l)->cost = cost;
1809                                                 (cur_node + l)->item = (struct lzms_item) {
1810                                                         .length = l,
1811                                                         .source = source,
1812                                                 };
1813                                                 (cur_node + l)->num_extra_items = 0;
1814                                         }
1815                                 } while (++l <= len);
1816                         }
1817                 }
1818
1819                 /* Literal  */
1820                 if (end_node < cur_node + 1)
1821                         (++end_node)->cost = INFINITE_COST;
1822                 const u32 cur_and_lit_cost = cur_node->cost +
1823                                              lzms_literal_cost(c, cur_node->state.main_state,
1824                                                                *in_next);
1825                 if (cur_and_lit_cost < (cur_node + 1)->cost) {
1826                         (cur_node + 1)->cost = cur_and_lit_cost;
1827                         (cur_node + 1)->item = (struct lzms_item) {
1828                                 .length = 1,
1829                                 .source = *in_next,
1830                         };
1831                         (cur_node + 1)->num_extra_items = 0;
1832                 } else if (c->try_lit_lzrep0 && in_end - (in_next + 1) >= 2) {
1833                         /* try lit + LZ-rep0  */
1834                         const u32 offset =
1835                                 (cur_node->state.prev_lz_offset) ?
1836                                         cur_node->state.prev_lz_offset :
1837                                         cur_node->state.recent_lz_offsets[0];
1838
1839                         if (load_u16_unaligned(in_next + 1) ==
1840                             load_u16_unaligned(in_next + 1 - offset))
1841                         {
1842                                 const u32 rep0_len = lz_extend(in_next + 1,
1843                                                                in_next + 1 - offset,
1844                                                                2,
1845                                                                min(in_end - (in_next + 1),
1846                                                                    c->mf.nice_match_len));
1847
1848                                 unsigned main_state = cur_node->state.main_state;
1849
1850                                 /* Update main_state after literal  */
1851                                 main_state = (main_state << 1 | 0) % LZMS_NUM_MAIN_PROBS;
1852
1853                                 /* Add cost of LZ-rep0  */
1854                                 const u32 cost = cur_and_lit_cost +
1855                                                  lzms_bit_1_cost(main_state, c->main_probs) +
1856                                                  lzms_bit_0_cost(cur_node->state.match_state,
1857                                                                  c->match_probs) +
1858                                                  lzms_bit_1_cost(cur_node->state.lz_state,
1859                                                                  c->lz_probs) +
1860                                                  lzms_bit_0_cost(cur_node->state.lz_rep_states[0],
1861                                                                  c->lz_rep_probs[0]) +
1862                                                  lzms_fast_length_cost(c, rep0_len);
1863
1864                                 const u32 total_len = 1 + rep0_len;
1865
1866                                 while (end_node < cur_node + total_len)
1867                                         (++end_node)->cost = INFINITE_COST;
1868
1869                                 if (cost < (cur_node + total_len)->cost) {
1870                                         (cur_node + total_len)->cost = cost;
1871                                         (cur_node + total_len)->item = (struct lzms_item) {
1872                                                 .length = rep0_len,
1873                                                 .source = 0,
1874                                         };
1875                                         (cur_node + total_len)->extra_items[0] = (struct lzms_item) {
1876                                                 .length = 1,
1877                                                 .source = *in_next,
1878                                         };
1879                                         (cur_node + total_len)->num_extra_items = 1;
1880                                 }
1881                         }
1882                 }
1883
1884                 /* Advance to the next position.  */
1885                 in_next++;
1886                 cur_node++;
1887
1888                 /* The lowest-cost path to the current position is now known.
1889                  * Finalize the adaptive state that results from taking this
1890                  * lowest-cost path.  */
1891                 struct lzms_item item_to_take = cur_node->item;
1892                 struct lzms_optimum_node *source_node = cur_node - (item_to_take.length);
1893                 int next_item_idx = -1;
1894                 for (unsigned i = 0; i < cur_node->num_extra_items; i++) {
1895                         item_to_take = cur_node->extra_items[i];
1896                         source_node -= item_to_take.length;
1897                         next_item_idx++;
1898                 }
1899                 cur_node->state = source_node->state;
1900                 for (;;) {
1901                         const u32 length = item_to_take.length;
1902                         u32 source = item_to_take.source;
1903
1904                         cur_node->state.upcoming_lz_offset = 0;
1905                         cur_node->state.upcoming_delta_pair = 0;
1906                         if (length > 1) {
1907                                 /* Match  */
1908
1909                                 lzms_update_main_state(&cur_node->state, 1);
1910
1911                                 if (source & DELTA_SOURCE_TAG) {
1912                                         /* Delta match  */
1913
1914                                         lzms_update_match_state(&cur_node->state, 1);
1915                                         source &= ~DELTA_SOURCE_TAG;
1916
1917                                         if (source >= LZMS_NUM_DELTA_REPS) {
1918                                                 /* Explicit offset delta match  */
1919                                                 u32 pair = source - (LZMS_NUM_DELTA_REPS - 1);
1920                                                 lzms_update_delta_state(&cur_node->state, 0);
1921                                                 cur_node->state.upcoming_delta_pair = pair;
1922                                         } else {
1923                                                 /* Repeat offset delta match  */
1924                                                 int rep_idx = source;
1925
1926                                                 lzms_update_delta_state(&cur_node->state, 1);
1927                                                 lzms_update_delta_rep_states(&cur_node->state, rep_idx);
1928
1929                                                 cur_node->state.upcoming_delta_pair =
1930                                                         cur_node->state.recent_delta_pairs[rep_idx];
1931
1932                                                 for (int i = rep_idx; i < LZMS_NUM_DELTA_REPS; i++)
1933                                                         cur_node->state.recent_delta_pairs[i] =
1934                                                                 cur_node->state.recent_delta_pairs[i + 1];
1935                                         }
1936                                 } else {
1937                                         lzms_update_match_state(&cur_node->state, 0);
1938
1939                                         if (source >= LZMS_NUM_LZ_REPS) {
1940                                                 /* Explicit offset LZ match  */
1941                                                 lzms_update_lz_state(&cur_node->state, 0);
1942                                                 cur_node->state.upcoming_lz_offset =
1943                                                         source - (LZMS_NUM_LZ_REPS - 1);
1944                                         } else {
1945                                                 /* Repeat offset LZ match  */
1946                                                 int rep_idx = source;
1947
1948                                                 lzms_update_lz_state(&cur_node->state, 1);
1949                                                 lzms_update_lz_rep_states(&cur_node->state, rep_idx);
1950
1951                                                 cur_node->state.upcoming_lz_offset =
1952                                                         cur_node->state.recent_lz_offsets[rep_idx];
1953
1954                                                 for (int i = rep_idx; i < LZMS_NUM_LZ_REPS; i++)
1955                                                         cur_node->state.recent_lz_offsets[i] =
1956                                                                 cur_node->state.recent_lz_offsets[i + 1];
1957                                         }
1958                                 }
1959                         } else {
1960                                 /* Literal  */
1961                                 lzms_update_main_state(&cur_node->state, 0);
1962                         }
1963
1964                         lzms_update_lru_queues(&cur_node->state);
1965
1966                         if (next_item_idx < 0)
1967                                 break;
1968                         if (next_item_idx == 0)
1969                                 item_to_take = cur_node->item;
1970                         else
1971                                 item_to_take = cur_node->extra_items[next_item_idx - 1];
1972                         --next_item_idx;
1973                 }
1974
1975                 /*
1976                  * This loop will terminate when either of the following
1977                  * conditions is true:
1978                  *
1979                  * (1) cur_node == end_node
1980                  *
1981                  *      There are no paths that extend beyond the current
1982                  *      position.  In this case, any path to a later position
1983                  *      must pass through the current position, so we can go
1984                  *      ahead and choose the list of items that led to this
1985                  *      position.
1986                  *
1987                  * (2) cur_node == &c->optimum_nodes[NUM_OPTIM_NODES]
1988                  *
1989                  *      This bounds the number of times the algorithm can step
1990                  *      forward before it is guaranteed to start choosing items.
1991                  *      This limits the memory usage.  It also guarantees that
1992                  *      the parser will not go too long without updating the
1993                  *      probability tables.
1994                  *
1995                  * Note: no check for end-of-buffer is needed because
1996                  * end-of-buffer will trigger condition (1).
1997                  */
1998                 if (cur_node == end_node ||
1999                     cur_node == &c->optimum_nodes[NUM_OPTIM_NODES])
2000                 {
2001                         lzms_encode_nonempty_item_list(c, cur_node);
2002                         c->optimum_nodes[0].state = cur_node->state;
2003                         goto begin;
2004                 }
2005         }
2006 }
2007
2008 static void
2009 lzms_init_states_and_probabilities(struct lzms_compressor *c)
2010 {
2011         c->main_state = 0;
2012         c->match_state = 0;
2013         c->lz_state = 0;
2014         for (int i = 0; i < LZMS_NUM_LZ_REP_DECISIONS; i++)
2015                 c->lz_rep_states[i] = 0;
2016         c->delta_state = 0;
2017         for (int i = 0; i < LZMS_NUM_DELTA_REP_DECISIONS; i++)
2018                 c->delta_rep_states[i] = 0;
2019
2020         lzms_init_probability_entries(c->main_probs, LZMS_NUM_MAIN_PROBS);
2021         lzms_init_probability_entries(c->match_probs, LZMS_NUM_MATCH_PROBS);
2022         lzms_init_probability_entries(c->lz_probs, LZMS_NUM_LZ_PROBS);
2023         for (int i = 0; i < LZMS_NUM_LZ_REP_DECISIONS; i++)
2024                 lzms_init_probability_entries(c->lz_rep_probs[i], LZMS_NUM_LZ_REP_PROBS);
2025         lzms_init_probability_entries(c->delta_probs, LZMS_NUM_DELTA_PROBS);
2026         for (int i = 0; i < LZMS_NUM_DELTA_REP_DECISIONS; i++)
2027                 lzms_init_probability_entries(c->delta_rep_probs[i], LZMS_NUM_DELTA_REP_PROBS);
2028 }
2029
2030 static void
2031 lzms_init_huffman_codes(struct lzms_compressor *c, unsigned num_offset_slots)
2032 {
2033         lzms_init_huffman_code(&c->literal_rebuild_info,
2034                                LZMS_NUM_LITERAL_SYMS,
2035                                LZMS_LITERAL_CODE_REBUILD_FREQ,
2036                                c->literal_codewords,
2037                                c->literal_lens,
2038                                c->literal_freqs);
2039
2040         lzms_init_huffman_code(&c->lz_offset_rebuild_info,
2041                                num_offset_slots,
2042                                LZMS_LZ_OFFSET_CODE_REBUILD_FREQ,
2043                                c->lz_offset_codewords,
2044                                c->lz_offset_lens,
2045                                c->lz_offset_freqs);
2046
2047         lzms_init_huffman_code(&c->length_rebuild_info,
2048                                LZMS_NUM_LENGTH_SYMS,
2049                                LZMS_LENGTH_CODE_REBUILD_FREQ,
2050                                c->length_codewords,
2051                                c->length_lens,
2052                                c->length_freqs);
2053
2054         lzms_init_huffman_code(&c->delta_offset_rebuild_info,
2055                                num_offset_slots,
2056                                LZMS_DELTA_OFFSET_CODE_REBUILD_FREQ,
2057                                c->delta_offset_codewords,
2058                                c->delta_offset_lens,
2059                                c->delta_offset_freqs);
2060
2061         lzms_init_huffman_code(&c->delta_power_rebuild_info,
2062                                LZMS_NUM_DELTA_POWER_SYMS,
2063                                LZMS_DELTA_POWER_CODE_REBUILD_FREQ,
2064                                c->delta_power_codewords,
2065                                c->delta_power_lens,
2066                                c->delta_power_freqs);
2067 }
2068
2069 /*
2070  * Flush the output streams, prepare the final compressed data, and return its
2071  * size in bytes.
2072  *
2073  * A return value of 0 indicates that the data could not be compressed to fit in
2074  * the available space.
2075  */
2076 static size_t
2077 lzms_finalize(struct lzms_compressor *c)
2078 {
2079         size_t num_forwards_units;
2080         size_t num_backwards_units;
2081
2082         /* Flush both the forwards and backwards streams, and make sure they
2083          * didn't cross each other and start overwriting each other's data.  */
2084         if (!lzms_output_bitstream_flush(&c->os))
2085                 return 0;
2086
2087         if (!lzms_range_encoder_flush(&c->rc))
2088                 return 0;
2089
2090         if (c->rc.next > c->os.next)
2091                 return 0;
2092
2093         /* Now the compressed buffer contains the data output by the forwards
2094          * bitstream, then empty space, then data output by the backwards
2095          * bitstream.  Move the data output by the backwards bitstream to be
2096          * adjacent to the data output by the forward bitstream, and calculate
2097          * the compressed size that this results in.  */
2098         num_forwards_units = c->rc.next - c->rc.begin;
2099         num_backwards_units = c->rc.end - c->os.next;
2100
2101         memmove(c->rc.next, c->os.next, num_backwards_units * sizeof(le16));
2102
2103         return (num_forwards_units + num_backwards_units) * sizeof(le16);
2104 }
2105
2106 static u64
2107 lzms_get_needed_memory(size_t max_bufsize, unsigned compression_level,
2108                        bool destructive)
2109 {
2110         u64 size = 0;
2111
2112         if (max_bufsize > LZMS_MAX_BUFFER_SIZE)
2113                 return 0;
2114
2115         size += sizeof(struct lzms_compressor);
2116
2117         if (!destructive)
2118                 size += max_bufsize; /* in_buffer */
2119
2120         /* mf */
2121         size += lcpit_matchfinder_get_needed_memory(max_bufsize);
2122
2123         return size;
2124 }
2125
2126 static int
2127 lzms_create_compressor(size_t max_bufsize, unsigned compression_level,
2128                        bool destructive, void **c_ret)
2129 {
2130         struct lzms_compressor *c;
2131         u32 nice_match_len;
2132
2133         if (max_bufsize > LZMS_MAX_BUFFER_SIZE)
2134                 return WIMLIB_ERR_INVALID_PARAM;
2135
2136         c = ALIGNED_MALLOC(sizeof(struct lzms_compressor), 64);
2137         if (!c)
2138                 goto oom0;
2139
2140         c->destructive = destructive;
2141
2142         /* Scale nice_match_len with the compression level.  But to allow an
2143          * optimization for length cost calculations, don't allow nice_match_len
2144          * to exceed MAX_FAST_LENGTH.  */
2145         nice_match_len = min(((u64)compression_level * 63) / 50, MAX_FAST_LENGTH);
2146
2147         c->use_delta_matches = (compression_level >= 35);
2148         c->try_lzmatch_lit_lzrep0 = (compression_level >= 45);
2149         c->try_lit_lzrep0 = (compression_level >= 60);
2150         c->try_lzrep_lit_lzrep0 = (compression_level >= 60);
2151
2152         if (!c->destructive) {
2153                 c->in_buffer = MALLOC(max_bufsize);
2154                 if (!c->in_buffer)
2155                         goto oom1;
2156         }
2157
2158         if (!lcpit_matchfinder_init(&c->mf, max_bufsize, 2, nice_match_len))
2159                 goto oom2;
2160
2161         lzms_init_fast_length_slot_tab(c);
2162         lzms_init_offset_slot_tabs(c);
2163
2164         *c_ret = c;
2165         return 0;
2166
2167 oom2:
2168         if (!c->destructive)
2169                 FREE(c->in_buffer);
2170 oom1:
2171         ALIGNED_FREE(c);
2172 oom0:
2173         return WIMLIB_ERR_NOMEM;
2174 }
2175
2176 static size_t
2177 lzms_compress(const void *in, size_t in_nbytes,
2178               void *out, size_t out_nbytes_avail, void *_c)
2179 {
2180         struct lzms_compressor *c = _c;
2181         size_t result;
2182
2183         /* Don't bother trying to compress extremely small inputs.  */
2184         if (in_nbytes < 4)
2185                 return 0;
2186
2187         /* Copy the input data into the internal buffer and preprocess it.  */
2188         if (c->destructive)
2189                 c->in_buffer = (void *)in;
2190         else
2191                 memcpy(c->in_buffer, in, in_nbytes);
2192         c->in_nbytes = in_nbytes;
2193         lzms_x86_filter(c->in_buffer, in_nbytes, c->last_target_usages, false);
2194
2195         /* Prepare the matchfinders.  */
2196         lcpit_matchfinder_load_buffer(&c->mf, c->in_buffer, c->in_nbytes);
2197         if (c->use_delta_matches)
2198                 lzms_init_delta_matchfinder(c);
2199
2200         /* Initialize the encoder structures.  */
2201         lzms_range_encoder_init(&c->rc, out, out_nbytes_avail / sizeof(le16));
2202         lzms_output_bitstream_init(&c->os, out, out_nbytes_avail / sizeof(le16));
2203         lzms_init_states_and_probabilities(c);
2204         lzms_init_huffman_codes(c, lzms_get_num_offset_slots(c->in_nbytes));
2205
2206         /* The main loop: parse and encode.  */
2207         lzms_near_optimal_parse(c);
2208
2209         /* Return the compressed data size or 0.  */
2210         result = lzms_finalize(c);
2211         if (!result && c->destructive)
2212                 lzms_x86_filter(c->in_buffer, c->in_nbytes, c->last_target_usages, true);
2213         return result;
2214 }
2215
2216 static void
2217 lzms_free_compressor(void *_c)
2218 {
2219         struct lzms_compressor *c = _c;
2220
2221         if (!c->destructive)
2222                 FREE(c->in_buffer);
2223         lcpit_matchfinder_destroy(&c->mf);
2224         ALIGNED_FREE(c);
2225 }
2226
2227 const struct compressor_ops lzms_compressor_ops = {
2228         .get_needed_memory  = lzms_get_needed_memory,
2229         .create_compressor  = lzms_create_compressor,
2230         .compress           = lzms_compress,
2231         .free_compressor    = lzms_free_compressor,
2232 };