/*
 * lzms_compress.c
 *
 * A compressor for the LZMS compression format.
 */

/*
 * Copyright (C) 2013, 2014, 2015 Eric Biggers
 *
 * This file is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 3 of the License, or (at your option) any
 * later version.
 *
 * This file is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
 * details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this file; if not, see http://www.gnu.org/licenses/.
 */

#ifdef HAVE_CONFIG_H
#  include "config.h"
#endif

#include <limits.h>
#include <string.h>

#include "wimlib/compress_common.h"
#include "wimlib/compressor_ops.h"
#include "wimlib/error.h"
#include "wimlib/lcpit_matchfinder.h"
#include "wimlib/lz_extend.h"
#include "wimlib/lz_hash.h"
#include "wimlib/lzms_common.h"
#include "wimlib/unaligned.h"
#include "wimlib/util.h"

/*
 * MAX_FAST_LENGTH is the maximum match length for which the length slot can be
 * looked up directly in 'fast_length_slot_tab' and the length cost can be
 * looked up directly in 'fast_length_cost_tab'.
 *
 * We also limit the 'nice_match_len' parameter to this value.  Consequently, if
 * the longest match found is shorter than 'nice_match_len', then it must also
 * be shorter than MAX_FAST_LENGTH.  This makes it possible to do fast lookups
 * of length costs using 'fast_length_cost_tab' without having to keep checking
 * whether the length exceeds MAX_FAST_LENGTH or not.
 */
#define MAX_FAST_LENGTH         255

/* NUM_OPTIM_NODES is the maximum number of bytes the parsing algorithm will
 * step forward before forcing the pending items to be encoded.  If this value
 * is increased, then there will be fewer forced flushes, but the probability
 * entries and Huffman codes will be more likely to become outdated.  */
#define NUM_OPTIM_NODES         2048

/* COST_SHIFT is a scaling factor that makes it possible to consider fractional
 * bit costs.  A single bit has a cost of (1 << COST_SHIFT).  */
#define COST_SHIFT              6
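
/* For example (illustrative): with COST_SHIFT == 6, one bit costs 1 << 6 = 64,
 * so a stored cost of 96 represents 96.0 / 64 = 1.5 bits.  This lets the
 * parser compare fractional bit costs using integer arithmetic only.  */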
63
64 /* Length of the hash table for finding delta matches  */
65 #define DELTA_HASH_ORDER        17
66 #define DELTA_HASH_LENGTH       ((u32)1 << DELTA_HASH_ORDER)
67
68 /* The number of bytes to hash when finding delta matches; also taken to be the
69  * minimum length of an explicit offset delta match  */
70 #define NBYTES_HASHED_FOR_DELTA 3
71
72 /* The number of delta match powers to consider (must be <=
73  * LZMS_NUM_DELTA_POWER_SYMS)  */
74 #define NUM_POWERS_TO_CONSIDER  6
75
76 /* This structure tracks the state of writing bits as a series of 16-bit coding
77  * units, starting at the end of the output buffer and proceeding backwards.  */
78 struct lzms_output_bitstream {
79
80         /* Bits that haven't yet been written to the output buffer  */
81         u64 bitbuf;
82
83         /* Number of bits currently held in @bitbuf  */
84         unsigned bitcount;
85
86         /* Pointer to one past the next position in the output buffer at which
87          * to output a 16-bit coding unit  */
88         le16 *next;
89
90         /* Pointer to the beginning of the output buffer (this is the "end" when
91          * writing backwards!)  */
92         le16 *begin;
93 };
94
95 /* This structure tracks the state of range encoding and its output, which
96  * starts at the beginning of the output buffer and proceeds forwards.  */
97 struct lzms_range_encoder {
98
99         /* The lower boundary of the current range.  Logically, this is a 33-bit
100          * integer whose high bit is needed to detect carries.  */
101         u64 lower_bound;
102
103         /* The size of the current range  */
104         u32 range_size;
105
106         /* The next 16-bit coding unit to output  */
107         u16 cache;
108
109         /* The number of 16-bit coding units whose output has been delayed due
110          * to possible carrying.  The first such coding unit is @cache; all
111          * subsequent such coding units are 0xffff.  */
112         u32 cache_size;
113
114         /* Pointer to the beginning of the output buffer  */
115         le16 *begin;
116
117         /* Pointer to the position in the output buffer at which the next coding
118          * unit must be written  */
119         le16 *next;
120
121         /* Pointer to just past the end of the output buffer  */
122         le16 *end;
123 };
124
125 /* Bookkeeping information for an adaptive Huffman code  */
126 struct lzms_huffman_rebuild_info {
127
128         /* The remaining number of symbols to encode until this code must be
129          * rebuilt  */
130         unsigned num_syms_until_rebuild;
131
132         /* The number of symbols in this code  */
133         unsigned num_syms;
134
135         /* The rebuild frequency of this code, in symbols  */
136         unsigned rebuild_freq;
137
138         /* The Huffman codeword of each symbol in this code  */
139         u32 *codewords;
140
141         /* The length of each Huffman codeword, in bits  */
142         u8 *lens;
143
144         /* The frequency of each symbol in this code  */
145         u32 *freqs;
146 };
147
148 /*
149  * The compressor-internal representation of a match or literal.
150  *
151  * Literals have length=1; matches have length > 1.  (We disallow matches of
152  * length 1, even though this is a valid length in LZMS.)
153  *
154  * The source is encoded as follows:
155  *
156  * - Literals: the literal byte itself
157  * - Explicit offset LZ matches: the match offset plus (LZMS_NUM_LZ_REPS - 1)
158  * - Repeat offset LZ matches: the index of the offset in recent_lz_offsets
159  * - Explicit offset delta matches: DELTA_SOURCE_TAG is set, the next 3 bits are
160  *   the power, and the remainder is the raw offset plus (LZMS_NUM_DELTA_REPS-1)
161  * - Repeat offset delta matches: DELTA_SOURCE_TAG is set, and the remainder is
162  *   the index of the (power, raw_offset) pair in recent_delta_pairs
163  */
164 struct lzms_item {
165         u32 length;
166         u32 source;
167 };
168
169 #define DELTA_SOURCE_TAG                ((u32)1 << 31)
170 #define DELTA_SOURCE_POWER_SHIFT        28
171 #define DELTA_SOURCE_RAW_OFFSET_MASK    (((u32)1 << DELTA_SOURCE_POWER_SHIFT) - 1)
172
173 static inline void
174 check_that_powers_fit_in_bitfield(void)
175 {
176         BUILD_BUG_ON(LZMS_NUM_DELTA_POWER_SYMS > (1 << (31 - DELTA_SOURCE_POWER_SHIFT)));
177 }
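
#if 0
/* Illustrative sketch, not part of the real compressor: a hypothetical helper
 * showing how an explicit offset delta match would be packed into the 'source'
 * encoding described above.  It is the inverse of the decomposition performed
 * in lzms_encode_item().  */
static u32
lzms_pack_explicit_delta_source(u32 power, u32 raw_offset)
{
        return DELTA_SOURCE_TAG |
               (power << DELTA_SOURCE_POWER_SHIFT) |
               (raw_offset + (LZMS_NUM_DELTA_REPS - 1));
}
#endif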

/* A stripped-down version of the adaptive state in LZMS which excludes the
 * probability entries and Huffman codes  */
struct lzms_adaptive_state {

        /* Recent offsets for LZ matches  */
        u32 recent_lz_offsets[LZMS_NUM_LZ_REPS + 1];
        u32 prev_lz_offset; /* 0 means none */
        u32 upcoming_lz_offset; /* 0 means none */

        /* Recent (power, raw offset) pairs for delta matches.
         * The low DELTA_SOURCE_POWER_SHIFT bits of each entry are the raw
         * offset, and the high bits are the power.  */
        u32 recent_delta_pairs[LZMS_NUM_DELTA_REPS + 1];
        u32 prev_delta_pair; /* 0 means none */
        u32 upcoming_delta_pair; /* 0 means none  */

        /* States for predicting the probabilities of item types  */
        u8 main_state;
        u8 match_state;
        u8 lz_state;
        u8 lz_rep_states[LZMS_NUM_LZ_REP_DECISIONS];
        u8 delta_state;
        u8 delta_rep_states[LZMS_NUM_DELTA_REP_DECISIONS];
} _aligned_attribute(64);

/*
 * This structure represents a byte position in the preprocessed input data and
 * a node in the graph of possible match/literal choices.
 *
 * Logically, each incoming edge to this node is labeled with a literal or a
 * match that can be taken to reach this position from an earlier position; and
 * each outgoing edge from this node is labeled with a literal or a match that
 * can be taken to advance from this position to a later position.
 */
struct lzms_optimum_node {

        /*
         * The cost of the lowest-cost path that has been found to reach this
         * position.  This can change as progressively lower cost paths are
         * found to reach this position.
         */
        u32 cost;
#define INFINITE_COST UINT32_MAX

        /*
         * @item is the last item that was taken to reach this position with
         * the stored @cost.  This can change as progressively lower cost
         * paths are found to reach this position.
         *
         * In some cases we look ahead more than one item.  If we looked ahead n
         * items to reach this position, then @item is the last item taken,
         * @extra_items contains the other items ordered from second-to-last to
         * first, and @num_extra_items is n - 1.
         */
        unsigned num_extra_items;
        struct lzms_item item;
        struct lzms_item extra_items[2];

        /*
         * The adaptive state that exists at this position.  This is filled in
         * lazily, only after the minimum-cost path to this position is found.
         *
         * Note: the way the algorithm handles this adaptive state in the
         * "minimum-cost" parse is actually only an approximation.  It's
         * possible for the globally optimal, minimum cost path to contain a
         * prefix, ending at a position, where that path prefix is *not* the
         * minimum cost path to that position.  This can happen if such a path
         * prefix results in a different adaptive state which results in lower
         * costs later.  Although the algorithm does do some heuristic
         * multi-item lookaheads, it does not solve this problem in general.
         *
         * Note: this adaptive state structure also does not include the
         * probability entries or current Huffman codewords.  Those aren't
         * maintained per-position and are only updated occasionally.
         */
        struct lzms_adaptive_state state;
} _aligned_attribute(64);

/* The main compressor structure  */
struct lzms_compressor {

        /* The matchfinder for LZ matches  */
        struct lcpit_matchfinder mf;

        /* The preprocessed buffer of data being compressed  */
        u8 *in_buffer;

        /* The number of bytes of data to be compressed, which is the number of
         * bytes of data in @in_buffer that are actually valid  */
        size_t in_nbytes;

        /*
         * Boolean flags to enable consideration of various types of multi-step
         * operations during parsing.
         *
         * Among other cases, multi-step operations can help with gaps where two
         * matches are separated by a non-matching byte.
         *
         * This idea is borrowed from Igor Pavlov's LZMA encoder.
         */
        bool try_lit_lzrep0;
        bool try_lzrep_lit_lzrep0;
        bool try_lzmatch_lit_lzrep0;

        /*
         * If true, the compressor can use delta matches.  This slows down
         * compression.  It improves the compression ratio greatly, slightly, or
         * not at all, depending on the input data.
         */
        bool use_delta_matches;

        /* If true, the compressor need not preserve the input buffer if it
         * compresses the data successfully.  */
        bool destructive;

        /* 'last_target_usages' is a large array that is only needed for
         * preprocessing, so it is in union with fields that don't need to be
         * initialized until after preprocessing.  */
        union {
        struct {

        /* Temporary space to store matches found by the LZ matchfinder  */
        struct lz_match matches[MAX_FAST_LENGTH - LZMS_MIN_MATCH_LENGTH + 1];

        /* Hash table for finding delta matches  */
        u32 delta_hash_table[DELTA_HASH_LENGTH];

        /* For each delta power, the hash code for the next sequence  */
        u32 next_delta_hashes[NUM_POWERS_TO_CONSIDER];

        /* The per-byte graph nodes for near-optimal parsing  */
        struct lzms_optimum_node optimum_nodes[NUM_OPTIM_NODES + MAX_FAST_LENGTH +
                                               1 + MAX_FAST_LENGTH];

        /* Table: length => current cost for small match lengths  */
        u32 fast_length_cost_tab[MAX_FAST_LENGTH + 1];

        /* Range encoder which outputs to the beginning of the compressed data
         * buffer, proceeding forwards  */
        struct lzms_range_encoder rc;

        /* Bitstream which outputs to the end of the compressed data buffer,
         * proceeding backwards  */
        struct lzms_output_bitstream os;

        /* States and probability entries for item type disambiguation  */
        unsigned main_state;
        unsigned match_state;
        unsigned lz_state;
        unsigned lz_rep_states[LZMS_NUM_LZ_REP_DECISIONS];
        unsigned delta_state;
        unsigned delta_rep_states[LZMS_NUM_DELTA_REP_DECISIONS];
        struct lzms_probability_entry main_probs[LZMS_NUM_MAIN_PROBS];
        struct lzms_probability_entry match_probs[LZMS_NUM_MATCH_PROBS];
        struct lzms_probability_entry lz_probs[LZMS_NUM_LZ_PROBS];
        struct lzms_probability_entry lz_rep_probs[LZMS_NUM_LZ_REP_DECISIONS]
                                                  [LZMS_NUM_LZ_REP_PROBS];
        struct lzms_probability_entry delta_probs[LZMS_NUM_DELTA_PROBS];
        struct lzms_probability_entry delta_rep_probs[LZMS_NUM_DELTA_REP_DECISIONS]
                                                     [LZMS_NUM_DELTA_REP_PROBS];

        /* Huffman codes  */

        struct lzms_huffman_rebuild_info literal_rebuild_info;
        u32 literal_codewords[LZMS_NUM_LITERAL_SYMS];
        u8 literal_lens[LZMS_NUM_LITERAL_SYMS];
        u32 literal_freqs[LZMS_NUM_LITERAL_SYMS];

        struct lzms_huffman_rebuild_info lz_offset_rebuild_info;
        u32 lz_offset_codewords[LZMS_MAX_NUM_OFFSET_SYMS];
        u8 lz_offset_lens[LZMS_MAX_NUM_OFFSET_SYMS];
        u32 lz_offset_freqs[LZMS_MAX_NUM_OFFSET_SYMS];

        struct lzms_huffman_rebuild_info length_rebuild_info;
        u32 length_codewords[LZMS_NUM_LENGTH_SYMS];
        u8 length_lens[LZMS_NUM_LENGTH_SYMS];
        u32 length_freqs[LZMS_NUM_LENGTH_SYMS];

        struct lzms_huffman_rebuild_info delta_offset_rebuild_info;
        u32 delta_offset_codewords[LZMS_MAX_NUM_OFFSET_SYMS];
        u8 delta_offset_lens[LZMS_MAX_NUM_OFFSET_SYMS];
        u32 delta_offset_freqs[LZMS_MAX_NUM_OFFSET_SYMS];

        struct lzms_huffman_rebuild_info delta_power_rebuild_info;
        u32 delta_power_codewords[LZMS_NUM_DELTA_POWER_SYMS];
        u8 delta_power_lens[LZMS_NUM_DELTA_POWER_SYMS];
        u32 delta_power_freqs[LZMS_NUM_DELTA_POWER_SYMS];

        }; /* struct */

        s32 last_target_usages[65536];

        }; /* union */

        /* Table: length => length slot for small match lengths  */
        u8 fast_length_slot_tab[MAX_FAST_LENGTH + 1];

        /* Tables for mapping offsets to offset slots  */

        /* slots [0, 167); 0 <= num_extra_bits <= 10 */
        u8 offset_slot_tab_1[0xe4a5];

        /* slots [167, 427); 11 <= num_extra_bits <= 15 */
        u16 offset_slot_tab_2[0x3d0000 >> 11];

        /* slots [427, 799); 16 <= num_extra_bits  */
        u16 offset_slot_tab_3[((LZMS_MAX_MATCH_OFFSET + 1) - 0xe4a5) >> 16];
};

/******************************************************************************
 *                   Offset and length slot acceleration                      *
 ******************************************************************************/

/* Generate the acceleration table for length slots.  */
static void
lzms_init_fast_length_slot_tab(struct lzms_compressor *c)
{
        unsigned slot = 0;
        for (u32 len = LZMS_MIN_MATCH_LENGTH; len <= MAX_FAST_LENGTH; len++) {
                if (len >= lzms_length_slot_base[slot + 1])
                        slot++;
                c->fast_length_slot_tab[len] = slot;
        }
}

/* Generate the acceleration tables for offset slots.  */
static void
lzms_init_offset_slot_tabs(struct lzms_compressor *c)
{
        u32 offset;
        unsigned slot = 0;

        /* slots [0, 167); 0 <= num_extra_bits <= 10 */
        for (offset = 1; offset < 0xe4a5; offset++) {
                if (offset >= lzms_offset_slot_base[slot + 1])
                        slot++;
                c->offset_slot_tab_1[offset] = slot;
        }

        /* slots [167, 427); 11 <= num_extra_bits <= 15 */
        for (; offset < 0x3de4a5; offset += (u32)1 << 11) {
                if (offset >= lzms_offset_slot_base[slot + 1])
                        slot++;
                c->offset_slot_tab_2[(offset - 0xe4a5) >> 11] = slot;
        }

        /* slots [427, 799); 16 <= num_extra_bits  */
        for (; offset < LZMS_MAX_MATCH_OFFSET + 1; offset += (u32)1 << 16) {
                if (offset >= lzms_offset_slot_base[slot + 1])
                        slot++;
                c->offset_slot_tab_3[(offset - 0xe4a5) >> 16] = slot;
        }
}

/*
 * Return the length slot for the specified match length, using the compressor's
 * acceleration table if the length is small enough.
 */
static inline unsigned
lzms_comp_get_length_slot(const struct lzms_compressor *c, u32 length)
{
        if (likely(length <= MAX_FAST_LENGTH))
                return c->fast_length_slot_tab[length];
        return lzms_get_length_slot(length);
}

/*
 * Return the offset slot for the specified match offset, using the compressor's
 * acceleration tables to speed up the mapping.
 */
static inline unsigned
lzms_comp_get_offset_slot(const struct lzms_compressor *c, u32 offset)
{
        if (offset < 0xe4a5)
                return c->offset_slot_tab_1[offset];
        offset -= 0xe4a5;
        if (offset < 0x3d0000)
                return c->offset_slot_tab_2[offset >> 11];
        return c->offset_slot_tab_3[offset >> 16];
}
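
/* For example (illustrative): offset 0x1234 is below 0xe4a5, so its slot is
 * read directly from offset_slot_tab_1[0x1234]; offset 0x20000 falls in the
 * second range, so its slot is read from
 * offset_slot_tab_2[(0x20000 - 0xe4a5) >> 11].  */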

/******************************************************************************
 *                             Range encoding                                 *
 ******************************************************************************/

/*
 * Initialize the range encoder @rc to write forwards to the specified buffer
 * @out that is @count 16-bit integers long.
 */
static void
lzms_range_encoder_init(struct lzms_range_encoder *rc, le16 *out, size_t count)
{
        rc->lower_bound = 0;
        rc->range_size = 0xffffffff;
        rc->cache = 0;
        rc->cache_size = 1;
        rc->begin = out;
        rc->next = out - 1;
        rc->end = out + count;
}

/*
 * Attempt to flush bits from the range encoder.
 *
 * The basic idea is that we're writing bits from @rc->lower_bound to the
 * output.  However, due to carrying, the writing of coding units with the
 * maximum value, as well as one prior coding unit, must be delayed until it is
 * determined whether a carry is needed.
 *
 * This is based on the public domain code for LZMA written by Igor Pavlov, but
 * with the following differences:
 *
 *      - In LZMS, 16-bit coding units are required rather than 8-bit.
 *
 *      - In LZMS, the first coding unit is not ignored by the decompressor, so
 *        the encoder cannot output a dummy value to that position.
 */
static void
lzms_range_encoder_shift_low(struct lzms_range_encoder *rc)
{
        if ((u32)(rc->lower_bound) < 0xffff0000 ||
            (u32)(rc->lower_bound >> 32) != 0)
        {
                /* Carry not needed (rc->lower_bound < 0xffff0000), or carry
                 * occurred ((rc->lower_bound >> 32) != 0, a.k.a. the carry bit
                 * is 1).  */
                do {
                        if (likely(rc->next >= rc->begin)) {
                                if (rc->next != rc->end) {
                                        put_unaligned_u16_le(rc->cache +
                                                             (u16)(rc->lower_bound >> 32),
                                                             rc->next++);
                                }
                        } else {
                                rc->next++;
                        }
                        rc->cache = 0xffff;
                } while (--rc->cache_size != 0);

                rc->cache = (rc->lower_bound >> 16) & 0xffff;
        }
        ++rc->cache_size;
        rc->lower_bound = (rc->lower_bound & 0xffff) << 16;
}

static bool
lzms_range_encoder_flush(struct lzms_range_encoder *rc)
{
        for (int i = 0; i < 4; i++)
                lzms_range_encoder_shift_low(rc);
        return rc->next != rc->end;
}

/*
 * Encode the next bit using the range encoder.
 *
 * @prob is the probability out of LZMS_PROBABILITY_DENOMINATOR that the next
 * bit is 0 rather than 1.
 */
static inline void
lzms_range_encode_bit(struct lzms_range_encoder *rc, int bit, u32 prob)
{
        /* Normalize if needed.  */
        if (rc->range_size <= 0xffff) {
                rc->range_size <<= 16;
                lzms_range_encoder_shift_low(rc);
        }

        u32 bound = (rc->range_size >> LZMS_PROBABILITY_BITS) * prob;
        if (bit == 0) {
                rc->range_size = bound;
        } else {
                rc->lower_bound += bound;
                rc->range_size -= bound;
        }
}
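
/* For example (illustrative, assuming LZMS_PROBABILITY_BITS == 6 so that the
 * denominator is 64): with range_size == 0x10000000 and prob == 32 (a 50%
 * chance of a 0 bit), bound == (0x10000000 >> 6) * 32 == 0x08000000.  A 0 bit
 * narrows the range to 0x08000000, while a 1 bit advances lower_bound by that
 * amount and keeps the other half of the range.  */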

/*
 * Encode a bit.  This wraps around lzms_range_encode_bit() to handle using and
 * updating the state and its corresponding probability entry.
 */
static inline void
lzms_encode_bit(int bit, unsigned *state_p, unsigned num_states,
                struct lzms_probability_entry *probs,
                struct lzms_range_encoder *rc)
{
        struct lzms_probability_entry *prob_entry;
        u32 prob;

        /* Load the probability entry for the current state.  */
        prob_entry = &probs[*state_p];

        /* Update the state based on the next bit.  */
        *state_p = ((*state_p << 1) | bit) & (num_states - 1);

        /* Get the probability that the bit is 0.  */
        prob = lzms_get_probability(prob_entry);

        /* Update the probability entry.  */
        lzms_update_probability_entry(prob_entry, bit);

        /* Encode the bit using the range encoder.  */
        lzms_range_encode_bit(rc, bit, prob);
}

/* Helper functions for encoding bits in the various decision classes  */

static void
lzms_encode_main_bit(struct lzms_compressor *c, int bit)
{
        lzms_encode_bit(bit, &c->main_state, LZMS_NUM_MAIN_PROBS,
                        c->main_probs, &c->rc);
}

static void
lzms_encode_match_bit(struct lzms_compressor *c, int bit)
{
        lzms_encode_bit(bit, &c->match_state, LZMS_NUM_MATCH_PROBS,
                        c->match_probs, &c->rc);
}

static void
lzms_encode_lz_bit(struct lzms_compressor *c, int bit)
{
        lzms_encode_bit(bit, &c->lz_state, LZMS_NUM_LZ_PROBS,
                        c->lz_probs, &c->rc);
}

static void
lzms_encode_lz_rep_bit(struct lzms_compressor *c, int bit, int idx)
{
        lzms_encode_bit(bit, &c->lz_rep_states[idx], LZMS_NUM_LZ_REP_PROBS,
                        c->lz_rep_probs[idx], &c->rc);
}

static void
lzms_encode_delta_bit(struct lzms_compressor *c, int bit)
{
        lzms_encode_bit(bit, &c->delta_state, LZMS_NUM_DELTA_PROBS,
                        c->delta_probs, &c->rc);
}

static void
lzms_encode_delta_rep_bit(struct lzms_compressor *c, int bit, int idx)
{
        lzms_encode_bit(bit, &c->delta_rep_states[idx], LZMS_NUM_DELTA_REP_PROBS,
                        c->delta_rep_probs[idx], &c->rc);
}

/******************************************************************************
 *                   Huffman encoding and verbatim bits                       *
 ******************************************************************************/

/*
 * Initialize the output bitstream @os to write backwards to the specified
 * buffer @out that is @count 16-bit integers long.
 */
static void
lzms_output_bitstream_init(struct lzms_output_bitstream *os,
                           le16 *out, size_t count)
{
        os->bitbuf = 0;
        os->bitcount = 0;
        os->next = out + count;
        os->begin = out;
}

/*
 * Write some bits, contained in the low-order @num_bits bits of @bits, to the
 * output bitstream @os.
 *
 * @max_num_bits is a compile-time constant that specifies the maximum number of
 * bits that can ever be written at this call site.
 */
static inline void
lzms_write_bits(struct lzms_output_bitstream *os, const u32 bits,
                const unsigned num_bits, const unsigned max_num_bits)
{
        /* Add the bits to the bit buffer variable.  */
        os->bitcount += num_bits;
        os->bitbuf = (os->bitbuf << num_bits) | bits;

        /* Check whether any coding units need to be written.  */
        while (os->bitcount >= 16) {

                os->bitcount -= 16;

                /* Write a coding unit, unless it would underflow the buffer. */
                if (os->next != os->begin)
                        put_unaligned_u16_le(os->bitbuf >> os->bitcount, --os->next);

                /* Optimization for call sites that never write more than 16
                 * bits at once.  */
                if (max_num_bits <= 16)
                        break;
        }
}
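
/* For example (illustrative): if the bit buffer already holds 15 bits and a
 * call site with max_num_bits == 20 writes 20 more, os->bitcount becomes 35
 * and the loop above outputs two 16-bit coding units, leaving 3 bits pending.
 * A call site with max_num_bits <= 16 can never accumulate two complete
 * coding units at once, which is why it may break after one iteration.  */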

/*
 * Flush the output bitstream, ensuring that all bits written to it have been
 * written to memory.  Return %true if all bits have been output successfully,
 * or %false if an overrun occurred.
 */
static bool
lzms_output_bitstream_flush(struct lzms_output_bitstream *os)
{
        if (os->next == os->begin)
                return false;

        if (os->bitcount != 0)
                put_unaligned_u16_le(os->bitbuf << (16 - os->bitcount), --os->next);

        return true;
}

static void
lzms_build_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info)
{
        make_canonical_huffman_code(rebuild_info->num_syms,
                                    LZMS_MAX_CODEWORD_LENGTH,
                                    rebuild_info->freqs,
                                    rebuild_info->lens,
                                    rebuild_info->codewords);
        rebuild_info->num_syms_until_rebuild = rebuild_info->rebuild_freq;
}

static void
lzms_init_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info,
                       unsigned num_syms, unsigned rebuild_freq,
                       u32 *codewords, u8 *lens, u32 *freqs)
{
        rebuild_info->num_syms = num_syms;
        rebuild_info->rebuild_freq = rebuild_freq;
        rebuild_info->codewords = codewords;
        rebuild_info->lens = lens;
        rebuild_info->freqs = freqs;
        lzms_init_symbol_frequencies(freqs, num_syms);
        lzms_build_huffman_code(rebuild_info);
}

static void
lzms_rebuild_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info)
{
        lzms_build_huffman_code(rebuild_info);
        lzms_dilute_symbol_frequencies(rebuild_info->freqs, rebuild_info->num_syms);
}

/*
 * Encode a symbol using the specified Huffman code.  Then, if the Huffman code
 * needs to be rebuilt, rebuild it and return true; otherwise return false.
 */
static inline bool
lzms_huffman_encode_symbol(unsigned sym,
                           const u32 *codewords, const u8 *lens, u32 *freqs,
                           struct lzms_output_bitstream *os,
                           struct lzms_huffman_rebuild_info *rebuild_info)
{
        lzms_write_bits(os, codewords[sym], lens[sym], LZMS_MAX_CODEWORD_LENGTH);
        ++freqs[sym];
        if (--rebuild_info->num_syms_until_rebuild == 0) {
                lzms_rebuild_huffman_code(rebuild_info);
                return true;
        }
        return false;
}

/* Helper routines to encode symbols using the various Huffman codes  */

static bool
lzms_encode_literal_symbol(struct lzms_compressor *c, unsigned sym)
{
        return lzms_huffman_encode_symbol(sym, c->literal_codewords,
                                          c->literal_lens, c->literal_freqs,
                                          &c->os, &c->literal_rebuild_info);
}

static bool
lzms_encode_lz_offset_symbol(struct lzms_compressor *c, unsigned sym)
{
        return lzms_huffman_encode_symbol(sym, c->lz_offset_codewords,
                                          c->lz_offset_lens, c->lz_offset_freqs,
                                          &c->os, &c->lz_offset_rebuild_info);
}

static bool
lzms_encode_length_symbol(struct lzms_compressor *c, unsigned sym)
{
        return lzms_huffman_encode_symbol(sym, c->length_codewords,
                                          c->length_lens, c->length_freqs,
                                          &c->os, &c->length_rebuild_info);
}

static bool
lzms_encode_delta_offset_symbol(struct lzms_compressor *c, unsigned sym)
{
        return lzms_huffman_encode_symbol(sym, c->delta_offset_codewords,
                                          c->delta_offset_lens, c->delta_offset_freqs,
                                          &c->os, &c->delta_offset_rebuild_info);
}

static bool
lzms_encode_delta_power_symbol(struct lzms_compressor *c, unsigned sym)
{
        return lzms_huffman_encode_symbol(sym, c->delta_power_codewords,
                                          c->delta_power_lens, c->delta_power_freqs,
                                          &c->os, &c->delta_power_rebuild_info);
}

static void
lzms_update_fast_length_costs(struct lzms_compressor *c);

/*
 * Encode a match length.  If this causes the Huffman code for length symbols to
 * be rebuilt, also update the length costs array used by the parser.
 */
static void
lzms_encode_length(struct lzms_compressor *c, u32 length)
{
        unsigned slot = lzms_comp_get_length_slot(c, length);

        if (lzms_encode_length_symbol(c, slot))
                lzms_update_fast_length_costs(c);

        lzms_write_bits(&c->os, length - lzms_length_slot_base[slot],
                        lzms_extra_length_bits[slot],
                        LZMS_MAX_EXTRA_LENGTH_BITS);
}

/* Encode the offset of an LZ match.  */
static void
lzms_encode_lz_offset(struct lzms_compressor *c, u32 offset)
{
        unsigned slot = lzms_comp_get_offset_slot(c, offset);

        lzms_encode_lz_offset_symbol(c, slot);
        lzms_write_bits(&c->os, offset - lzms_offset_slot_base[slot],
                        lzms_extra_offset_bits[slot],
                        LZMS_MAX_EXTRA_OFFSET_BITS);
}

/* Encode the raw offset of a delta match.  */
static void
lzms_encode_delta_raw_offset(struct lzms_compressor *c, u32 raw_offset)
{
        unsigned slot = lzms_comp_get_offset_slot(c, raw_offset);

        lzms_encode_delta_offset_symbol(c, slot);
        lzms_write_bits(&c->os, raw_offset - lzms_offset_slot_base[slot],
                        lzms_extra_offset_bits[slot],
                        LZMS_MAX_EXTRA_OFFSET_BITS);
}

/******************************************************************************
 *                             Item encoding                                  *
 ******************************************************************************/

/* Encode the specified item, which may be a literal or any type of match.  */
static void
lzms_encode_item(struct lzms_compressor *c, u32 length, u32 source)
{
        /* Main bit: 0 = literal, 1 = match  */
        int main_bit = (length > 1);
        lzms_encode_main_bit(c, main_bit);

        if (!main_bit) {
                /* Literal  */
                unsigned literal = source;
                lzms_encode_literal_symbol(c, literal);
        } else {
                /* Match  */

                /* Match bit: 0 = LZ match, 1 = delta match  */
                int match_bit = (source & DELTA_SOURCE_TAG) != 0;
                lzms_encode_match_bit(c, match_bit);

                if (!match_bit) {
                        /* LZ match  */

                        /* LZ bit: 0 = explicit offset, 1 = repeat offset  */
                        int lz_bit = (source < LZMS_NUM_LZ_REPS);
                        lzms_encode_lz_bit(c, lz_bit);

                        if (!lz_bit) {
                                /* Explicit offset LZ match  */
                                u32 offset = source - (LZMS_NUM_LZ_REPS - 1);
                                lzms_encode_lz_offset(c, offset);
                        } else {
                                /* Repeat offset LZ match  */
                                int rep_idx = source;
                                for (int i = 0; i < rep_idx; i++)
                                        lzms_encode_lz_rep_bit(c, 1, i);
                                if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS)
                                        lzms_encode_lz_rep_bit(c, 0, rep_idx);
                        }
                } else {
                        /* Delta match  */

                        source &= ~DELTA_SOURCE_TAG;

                        /* Delta bit: 0 = explicit offset, 1 = repeat offset  */
                        int delta_bit = (source < LZMS_NUM_DELTA_REPS);
                        lzms_encode_delta_bit(c, delta_bit);

                        if (!delta_bit) {
                                /* Explicit offset delta match  */
                                u32 power = source >> DELTA_SOURCE_POWER_SHIFT;
                                u32 raw_offset = (source & DELTA_SOURCE_RAW_OFFSET_MASK) -
                                                 (LZMS_NUM_DELTA_REPS - 1);
                                lzms_encode_delta_power_symbol(c, power);
                                lzms_encode_delta_raw_offset(c, raw_offset);
                        } else {
                                /* Repeat offset delta match  */
                                int rep_idx = source;
                                for (int i = 0; i < rep_idx; i++)
                                        lzms_encode_delta_rep_bit(c, 1, i);
                                if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS)
                                        lzms_encode_delta_rep_bit(c, 0, rep_idx);
                        }
                }

                /* Match length (encoded the same way for any match type)  */
                lzms_encode_length(c, length);
        }
}

/* Encode a list of matches and literals chosen by the parsing algorithm.  */
static void
lzms_encode_nonempty_item_list(struct lzms_compressor *c,
                               struct lzms_optimum_node *end_node)
{
        /* Since we've stored at each node the item we took to arrive at that
         * node, we can trace our chosen path in backwards order.  However, for
         * encoding we need to trace our chosen path in forwards order.  To make
         * this possible, the following loop moves the items from their
         * destination nodes to their source nodes, which effectively reverses
         * the path.  (Think of it like reversing a singly-linked list.)  */
        struct lzms_optimum_node *cur_node = end_node;
        struct lzms_item saved_item = cur_node->item;
        do {
                struct lzms_item item = saved_item;
                if (cur_node->num_extra_items > 0) {
                        /* Handle an arrival via multi-item lookahead.  */
                        unsigned i = 0;
                        struct lzms_optimum_node *orig_node = cur_node;
                        do {
                                cur_node -= item.length;
                                cur_node->item = item;
                                item = orig_node->extra_items[i];
                        } while (++i != orig_node->num_extra_items);
                }
                cur_node -= item.length;
                saved_item = cur_node->item;
                cur_node->item = item;
        } while (cur_node != c->optimum_nodes);

        /* Now trace the chosen path in forwards order, encoding each item.  */
        do {
                lzms_encode_item(c, cur_node->item.length, cur_node->item.source);
                cur_node += cur_node->item.length;
        } while (cur_node != end_node);
}

static inline void
lzms_encode_item_list(struct lzms_compressor *c,
                      struct lzms_optimum_node *end_node)
{
        if (end_node != c->optimum_nodes)
                lzms_encode_nonempty_item_list(c, end_node);
}

/******************************************************************************
 *                             Cost evaluation                                *
 ******************************************************************************/

/*
 * If p is the predicted probability of the next bit being a 0, then the number
 * of bits required to encode a 0 bit using a binary range encoder is the real
 * number -log2(p), and the number of bits required to encode a 1 bit is the
 * real number -log2(1 - p).  To avoid computing either of these expressions at
 * runtime, 'lzms_bit_costs' is a precomputed table that stores a mapping from
 * probability to cost for each possible probability.  Specifically, the array
 * indices are the numerators of the possible probabilities in LZMS, where the
 * denominators are LZMS_PROBABILITY_DENOMINATOR; and the stored costs are the
 * bit costs multiplied by 1<<COST_SHIFT and rounded to the nearest integer.
 * Furthermore, the values stored for 0% and 100% probabilities are equal to the
 * adjacent values, since these probabilities are not actually permitted.  This
 * allows us to use the num_recent_zero_bits value from the
 * lzms_probability_entry as the array index without fixing up these two special
 * cases.
 */
static const u32 lzms_bit_costs[LZMS_PROBABILITY_DENOMINATOR + 1] = {
        384, 384, 320, 283, 256, 235, 219, 204,
        192, 181, 171, 163, 155, 147, 140, 134,
        128, 122, 117, 112, 107, 103, 99,  94,
        91,  87,  83,  80,  76,  73,  70,  67,
        64,  61,  58,  56,  53,  51,  48,  46,
        43,  41,  39,  37,  35,  33,  30,  29,
        27,  25,  23,  21,  19,  17,  16,  14,
        12,  11,  9,   8,   6,   4,   3,   1,
        1
};
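
/* For example (illustrative): index 32 corresponds to probability 32/64 = 0.5,
 * and -log2(0.5) * (1 << COST_SHIFT) == 64, matching lzms_bit_costs[32].
 * Similarly, index 16 gives -log2(0.25) * 64 == 128.  */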

static inline void
check_cost_shift(void)
{
        /* lzms_bit_costs is hard-coded to the current COST_SHIFT.  */
        BUILD_BUG_ON(COST_SHIFT != 6);
}

#if 0
#include <math.h>

static void
lzms_compute_bit_costs(void)
{
        for (u32 i = 0; i <= LZMS_PROBABILITY_DENOMINATOR; i++) {
                u32 prob = i;
                if (prob == 0)
                        prob++;
                else if (prob == LZMS_PROBABILITY_DENOMINATOR)
                        prob--;

                lzms_bit_costs[i] = round(-log2((double)prob / LZMS_PROBABILITY_DENOMINATOR) *
                                          (1 << COST_SHIFT));
        }
}
#endif

/* Return the cost to encode a 0 bit in the specified context.  */
static inline u32
lzms_bit_0_cost(unsigned state, const struct lzms_probability_entry *probs)
{
        return lzms_bit_costs[probs[state].num_recent_zero_bits];
}

/* Return the cost to encode a 1 bit in the specified context.  */
static inline u32
lzms_bit_1_cost(unsigned state, const struct lzms_probability_entry *probs)
{
        return lzms_bit_costs[LZMS_PROBABILITY_DENOMINATOR -
                              probs[state].num_recent_zero_bits];
}

/* Return the cost to encode a literal, including the main bit.  */
static inline u32
lzms_literal_cost(struct lzms_compressor *c, unsigned main_state, unsigned literal)
{
        return lzms_bit_0_cost(main_state, c->main_probs) +
                ((u32)c->literal_lens[literal] << COST_SHIFT);
}

/* Update 'fast_length_cost_tab' to use the latest Huffman code.  */
static void
lzms_update_fast_length_costs(struct lzms_compressor *c)
{
        int slot = -1;
        u32 cost = 0;
        for (u32 len = LZMS_MIN_MATCH_LENGTH; len <= MAX_FAST_LENGTH; len++) {
                if (len >= lzms_length_slot_base[slot + 1]) {
                        slot++;
                        cost = (u32)(c->length_lens[slot] +
                                     lzms_extra_length_bits[slot]) << COST_SHIFT;
                }
                c->fast_length_cost_tab[len] = cost;
        }
}

/* Return the cost to encode the specified match length, which must not exceed
 * MAX_FAST_LENGTH.  */
static inline u32
lzms_fast_length_cost(const struct lzms_compressor *c, u32 length)
{
        return c->fast_length_cost_tab[length];
}

/* Return the cost to encode the specified LZ match offset.  */
static inline u32
lzms_lz_offset_cost(const struct lzms_compressor *c, u32 offset)
{
        unsigned slot = lzms_comp_get_offset_slot(c, offset);
        u32 num_bits = c->lz_offset_lens[slot] + lzms_extra_offset_bits[slot];
        return num_bits << COST_SHIFT;
}

/* Return the cost to encode the specified delta power and raw offset.  */
static inline u32
lzms_delta_source_cost(const struct lzms_compressor *c, u32 power, u32 raw_offset)
{
        unsigned slot = lzms_comp_get_offset_slot(c, raw_offset);
        u32 num_bits = c->delta_power_lens[power] + c->delta_offset_lens[slot] +
                       lzms_extra_offset_bits[slot];
        return num_bits << COST_SHIFT;
}

/******************************************************************************
 *                              Adaptive state                                *
 ******************************************************************************/

static void
lzms_init_adaptive_state(struct lzms_adaptive_state *state)
{
        for (int i = 0; i < LZMS_NUM_LZ_REPS + 1; i++)
                state->recent_lz_offsets[i] = i + 1;
        state->prev_lz_offset = 0;
        state->upcoming_lz_offset = 0;

        for (int i = 0; i < LZMS_NUM_DELTA_REPS + 1; i++)
                state->recent_delta_pairs[i] = i + 1;
        state->prev_delta_pair = 0;
        state->upcoming_delta_pair = 0;

        state->main_state = 0;
        state->match_state = 0;
        state->lz_state = 0;
        for (int i = 0; i < LZMS_NUM_LZ_REP_DECISIONS; i++)
                state->lz_rep_states[i] = 0;
        state->delta_state = 0;
        for (int i = 0; i < LZMS_NUM_DELTA_REP_DECISIONS; i++)
                state->delta_rep_states[i] = 0;
}

/*
 * Update the LRU queues for match sources when advancing by one item.
 *
 * Note: using LZMA as a point of comparison, the LRU queues in LZMS are more
 * complex because:
 *      - there are separate queues for LZ and delta matches
 *      - updates to the queues are delayed by one encoded item (this prevents
 *        sources from being bumped up to index 0 too early)
 */
static void
lzms_update_lru_queues(struct lzms_adaptive_state *state)
{
        if (state->prev_lz_offset != 0) {
                for (int i = LZMS_NUM_LZ_REPS - 1; i >= 0; i--)
                        state->recent_lz_offsets[i + 1] = state->recent_lz_offsets[i];
                state->recent_lz_offsets[0] = state->prev_lz_offset;
        }
        state->prev_lz_offset = state->upcoming_lz_offset;

        if (state->prev_delta_pair != 0) {
                for (int i = LZMS_NUM_DELTA_REPS - 1; i >= 0; i--)
                        state->recent_delta_pairs[i + 1] = state->recent_delta_pairs[i];
                state->recent_delta_pairs[0] = state->prev_delta_pair;
        }
        state->prev_delta_pair = state->upcoming_delta_pair;
}

static inline void
lzms_update_state(u8 *state_p, int bit, unsigned num_states)
{
        *state_p = ((*state_p << 1) | bit) % num_states;
}

static inline void
lzms_update_main_state(struct lzms_adaptive_state *state, int is_match)
{
        lzms_update_state(&state->main_state, is_match, LZMS_NUM_MAIN_PROBS);
}

static inline void
lzms_update_match_state(struct lzms_adaptive_state *state, int is_delta)
{
        lzms_update_state(&state->match_state, is_delta, LZMS_NUM_MATCH_PROBS);
}

static inline void
lzms_update_lz_state(struct lzms_adaptive_state *state, int is_rep)
{
        lzms_update_state(&state->lz_state, is_rep, LZMS_NUM_LZ_PROBS);
}

static inline void
lzms_update_lz_rep_states(struct lzms_adaptive_state *state, int rep_idx)
{
        for (int i = 0; i < rep_idx; i++)
                lzms_update_state(&state->lz_rep_states[i], 1, LZMS_NUM_LZ_REP_PROBS);

        if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS)
                lzms_update_state(&state->lz_rep_states[rep_idx], 0, LZMS_NUM_LZ_REP_PROBS);
}

static inline void
lzms_update_delta_state(struct lzms_adaptive_state *state, int is_rep)
{
        lzms_update_state(&state->delta_state, is_rep, LZMS_NUM_DELTA_PROBS);
}

static inline void
lzms_update_delta_rep_states(struct lzms_adaptive_state *state, int rep_idx)
{
        for (int i = 0; i < rep_idx; i++)
                lzms_update_state(&state->delta_rep_states[i], 1, LZMS_NUM_DELTA_REP_PROBS);

        if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS)
                lzms_update_state(&state->delta_rep_states[rep_idx], 0, LZMS_NUM_DELTA_REP_PROBS);
}

/******************************************************************************
 *                              Matchfinding                                  *
 ******************************************************************************/

/* Note: this code just handles finding delta matches.  The code for finding LZ
 * matches is elsewhere.  */


/* Initialize the delta matchfinder for a new input buffer.  */
static void
lzms_init_delta_matchfinder(struct lzms_compressor *c)
{
        /* Set all entries to use an invalid power, which will never match.  */
        BUILD_BUG_ON(NUM_POWERS_TO_CONSIDER >= (1 << (32 - DELTA_SOURCE_POWER_SHIFT)));
        memset(c->delta_hash_table, 0xFF, sizeof(c->delta_hash_table));

        /* Initialize the next hash code for each power.  We can just use zeroes
         * initially; it doesn't really matter.  */
        for (u32 i = 0; i < NUM_POWERS_TO_CONSIDER; i++)
                c->next_delta_hashes[i] = 0;
}

/*
 * Compute a DELTA_HASH_ORDER-bit hash code for the first
 * NBYTES_HASHED_FOR_DELTA bytes of the sequence beginning at @p when taken in a
 * delta context with the specified @span.
 */
static inline u32
lzms_delta_hash(const u8 *p, u32 span)
{
        /* A delta match has a certain span and an offset that is a multiple of
         * that span.  To reduce wasted space we use a single combined hash
         * table for all spans and positions, but to minimize collisions we
         * include in the hash code computation the span and the low-order bits
         * of the current position.  */

        BUILD_BUG_ON(NBYTES_HASHED_FOR_DELTA != 3);
        u8 d0 = *(p + 0) - *(p + 0 - span);
        u8 d1 = *(p + 1) - *(p + 1 - span);
        u8 d2 = *(p + 2) - *(p + 2 - span);
        u32 v = ((span + ((u32)(uintptr_t)p & (span - 1))) << 24) |
                ((u32)d2 << 16) | ((u32)d1 << 8) | d0;
        return lz_hash(v, DELTA_HASH_ORDER);
}
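
/* For example (illustrative): in a linear ramp of bytes 1, 2, 3, 4, ... the
 * raw bytes never repeat, but with span == 1 (power == 0) every byte-wise
 * difference is 1, so the difference streams of any two positions match and
 * long delta matches can be found even though no LZ match exists.  */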
1221
1222 /*
1223  * Given a match between @in_next and @matchptr in a delta context with the
1224  * specified @span and having the initial @len, extend the match as far as
1225  * possible, up to a limit of @max_len.
1226  */
1227 static inline u32
1228 lzms_extend_delta_match(const u8 *in_next, const u8 *matchptr,
1229                         u32 len, u32 max_len, u32 span)
1230 {
1231         while (len < max_len &&
1232                (u8)(*(in_next + len) - *(in_next + len - span)) ==
1233                (u8)(*(matchptr + len) - *(matchptr + len - span)))
1234         {
1235                 len++;
1236         }
1237         return len;
1238 }
1239
1240 static void
1241 lzms_delta_matchfinder_skip_bytes(struct lzms_compressor *c,
1242                                   const u8 *in_next, u32 count)
1243 {
1244         u32 pos = in_next - c->in_buffer;
1245         if (unlikely(c->in_nbytes - (pos + count) <= NBYTES_HASHED_FOR_DELTA + 1))
1246                 return;
1247         do {
1248                 /* Update the hash table for each power.  */
1249                 for (u32 power = 0; power < NUM_POWERS_TO_CONSIDER; power++) {
1250                         const u32 span = (u32)1 << power;
1251                         if (unlikely(pos < span))
1252                                 continue;
1253                         const u32 next_hash = lzms_delta_hash(in_next + 1, span);
1254                         const u32 hash = c->next_delta_hashes[power];
1255                         c->delta_hash_table[hash] =
1256                                 (power << DELTA_SOURCE_POWER_SHIFT) | pos;
1257                         c->next_delta_hashes[power] = next_hash;
1258                         prefetch(&c->delta_hash_table[next_hash]);
1259                 }
1260         } while (in_next++, pos++, --count);
1261 }
1262
1263 /*
1264  * Skip the next @count bytes (don't search for matches at them).  @in_next
1265  * points to the first byte to skip.  The return value is @in_next + count.
1266  */
1267 static const u8 *
1268 lzms_skip_bytes(struct lzms_compressor *c, u32 count, const u8 *in_next)
1269 {
1270         lcpit_matchfinder_skip_bytes(&c->mf, count);
1271         if (c->use_delta_matches)
1272                 lzms_delta_matchfinder_skip_bytes(c, in_next, count);
1273         return in_next + count;
1274 }
1275
1276 /******************************************************************************
1277  *                          "Near-optimal" parsing                            *
1278  ******************************************************************************/
1279
1280 /*
1281  * The main near-optimal parsing routine.
1282  *
1283  * Briefly, the algorithm does an approximate minimum-cost path search to find a
1284  * "near-optimal" sequence of matches and literals to output, based on the
1285  * current cost model.  The algorithm steps forward, position by position (byte
1286  * by byte), and updates the minimum cost path to reach each later position that
1287  * can be reached using a match or literal from the current position.  This is
1288  * essentially Dijkstra's algorithm in disguise: the graph nodes are positions,
1289  * the graph edges are possible matches/literals to code, and the cost of each
1290  * edge is the estimated number of bits that will be required to output the
1291  * corresponding match or literal.  But one difference is that we actually
1292  * compute the lowest-cost path in pieces, where each piece is terminated when
1293  * there are no choices to be made.
1294  *
1295  * The costs of literals and matches are estimated using the range encoder
1296  * states and the semi-adaptive Huffman codes.  Except for range encoding
1297  * states, costs are assumed to be constant throughout a single run of the
1298  * parsing algorithm, which can parse up to NUM_OPTIM_NODES bytes of data.  This
1299  * introduces a source of non-optimality because the probabilities and Huffman
1300  * codes can change over this part of the data.  And of course, there are
1301  * various other reasons why the result isn't optimal in terms of compression
1302  * ratio.
1303  */
1304 static void
1305 lzms_near_optimal_parse(struct lzms_compressor *c)
1306 {
1307         const u8 *in_next = c->in_buffer;
1308         const u8 * const in_end = &c->in_buffer[c->in_nbytes];
1309         struct lzms_optimum_node *cur_node;
1310         struct lzms_optimum_node *end_node;
1311
1312         /* Set initial length costs for lengths <= MAX_FAST_LENGTH.  */
1313         lzms_update_fast_length_costs(c);
1314
1315         /* Set up the initial adaptive state.  */
1316         lzms_init_adaptive_state(&c->optimum_nodes[0].state);
1317
1318 begin:
1319         /* Start building a new list of items, which will correspond to the next
1320          * piece of the overall minimum-cost path.  */
1321
1322         cur_node = c->optimum_nodes;
1323         cur_node->cost = 0;
1324         end_node = cur_node;
1325
1326         if (in_next == in_end)
1327                 return;
1328
1329         /* The following loop runs once for each per byte in the input buffer,
1330          * except in a few shortcut cases.  */
        for (;;) {
                u32 num_matches;

                /* Repeat offset LZ matches  */
                if (likely(in_next - c->in_buffer >= LZMS_NUM_LZ_REPS &&
                           in_end - in_next >= 2))
                {
                        for (int rep_idx = 0; rep_idx < LZMS_NUM_LZ_REPS; rep_idx++) {

                                /* Looking for a repeat offset LZ match at queue
                                 * index @rep_idx  */

                                const u32 offset = cur_node->state.recent_lz_offsets[rep_idx];
                                const u8 * const matchptr = in_next - offset;

                                /* Check the first 2 bytes before entering the extension loop.  */
                                if (load_u16_unaligned(in_next) != load_u16_unaligned(matchptr))
                                        continue;

                                /* Extend the match to its full length.  */
                                const u32 rep_len = lz_extend(in_next, matchptr, 2, in_end - in_next);

                                /* Early out for long repeat offset LZ match */
                                if (rep_len >= c->mf.nice_match_len) {

                                        in_next = lzms_skip_bytes(c, rep_len, in_next);

                                        lzms_encode_item_list(c, cur_node);
                                        lzms_encode_item(c, rep_len, rep_idx);

                                        c->optimum_nodes[0].state = cur_node->state;
                                        cur_node = &c->optimum_nodes[0];

                                        cur_node->state.upcoming_lz_offset =
                                                cur_node->state.recent_lz_offsets[rep_idx];
                                        cur_node->state.upcoming_delta_pair = 0;
                                        for (int i = rep_idx; i < LZMS_NUM_LZ_REPS; i++)
                                                cur_node->state.recent_lz_offsets[i] =
                                                        cur_node->state.recent_lz_offsets[i + 1];
                                        lzms_update_lru_queues(&cur_node->state);
                                        lzms_update_main_state(&cur_node->state, 1);
                                        lzms_update_match_state(&cur_node->state, 0);
                                        lzms_update_lz_state(&cur_node->state, 1);
                                        lzms_update_lz_rep_states(&cur_node->state, rep_idx);
                                        goto begin;
                                }

                                while (end_node < cur_node + rep_len)
                                        (++end_node)->cost = INFINITE_COST;

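                                /*
                                 * The cost shared by every length of this
                                 * repeat offset match: the decision bits
                                 * main = 1 (match), match = 0 (LZ match),
                                 * lz = 1 (repeat offset), then a 1 bit for
                                 * each queue index tried before @rep_idx,
                                 * and, if a later index could still have
                                 * been chosen, a 0 bit selecting @rep_idx
                                 * itself.  Only the length cost varies in
                                 * the loop below.
                                 */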
                                u32 base_cost = cur_node->cost +
                                                lzms_bit_1_cost(cur_node->state.main_state,
                                                                c->main_probs) +
                                                lzms_bit_0_cost(cur_node->state.match_state,
                                                                c->match_probs) +
                                                lzms_bit_1_cost(cur_node->state.lz_state,
                                                                c->lz_probs);

                                for (int i = 0; i < rep_idx; i++)
                                        base_cost += lzms_bit_1_cost(cur_node->state.lz_rep_states[i],
                                                                     c->lz_rep_probs[i]);

                                if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS)
                                        base_cost += lzms_bit_0_cost(cur_node->state.lz_rep_states[rep_idx],
                                                                     c->lz_rep_probs[rep_idx]);

                                u32 len = 2;
                                do {
                                        u32 cost = base_cost + lzms_fast_length_cost(c, len);
                                        if (cost < (cur_node + len)->cost) {
                                                (cur_node + len)->cost = cost;
                                                (cur_node + len)->item = (struct lzms_item) {
                                                        .length = len,
                                                        .source = rep_idx,
                                                };
                                                (cur_node + len)->num_extra_items = 0;
                                        }
                                } while (++len <= rep_len);

                                /* try LZ-rep + lit + LZ-rep0  */
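                                /*
                                 * Editor's note: the byte pattern priced in
                                 * this block is, schematically,
                                 *
                                 *   [rep_len matching][1 mismatched byte][rep0_len matching]
                                 *
                                 * i.e. a single wrong byte interrupts what
                                 * would otherwise be a longer repeat match at
                                 * the same offset.  Pricing the three items
                                 * (match, literal, rep0 match) as one unit
                                 * lets the parser evaluate this sequence with
                                 * the adaptive-state updates it actually
                                 * implies, rather than relying on whichever
                                 * partial path happens to win the
                                 * intermediate nodes.
                                 */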
                                if (c->try_lzrep_lit_lzrep0 &&
                                    in_end - (in_next + rep_len) >= 3 &&
                                    load_u16_unaligned(in_next + rep_len + 1) ==
                                    load_u16_unaligned(matchptr + rep_len + 1))
                                {
                                        const u32 rep0_len = lz_extend(in_next + rep_len + 1,
                                                                       matchptr + rep_len + 1,
                                                                       2,
                                                                       min(c->mf.nice_match_len,
                                                                           in_end - (in_next + rep_len + 1)));

                                        unsigned main_state = cur_node->state.main_state;
                                        unsigned match_state = cur_node->state.match_state;
                                        unsigned lz_state = cur_node->state.lz_state;
                                        unsigned lz_rep0_state = cur_node->state.lz_rep_states[0];

                                        /* update states for LZ-rep  */
                                        main_state = ((main_state << 1) | 1) % LZMS_NUM_MAIN_PROBS;
                                        match_state = ((match_state << 1) | 0) % LZMS_NUM_MATCH_PROBS;
                                        lz_state = ((lz_state << 1) | 1) % LZMS_NUM_LZ_PROBS;
                                        lz_rep0_state = ((lz_rep0_state << 1) | (rep_idx > 0)) %
                                                                LZMS_NUM_LZ_REP_PROBS;

                                        /* LZ-rep cost  */
                                        u32 cost = base_cost + lzms_fast_length_cost(c, rep_len);

                                        /* add literal cost  */
                                        cost += lzms_literal_cost(c, main_state, *(in_next + rep_len));

                                        /* update state for literal  */
                                        main_state = ((main_state << 1) | 0) % LZMS_NUM_MAIN_PROBS;

                                        /* add LZ-rep0 cost  */
                                        cost += lzms_bit_1_cost(main_state, c->main_probs) +
                                                lzms_bit_0_cost(match_state, c->match_probs) +
                                                lzms_bit_1_cost(lz_state, c->lz_probs) +
                                                lzms_bit_0_cost(lz_rep0_state, c->lz_rep_probs[0]) +
                                                lzms_fast_length_cost(c, rep0_len);

                                        const u32 total_len = rep_len + 1 + rep0_len;

                                        while (end_node < cur_node + total_len)
                                                (++end_node)->cost = INFINITE_COST;

                                        if (cost < (cur_node + total_len)->cost) {
                                                (cur_node + total_len)->cost = cost;
                                                (cur_node + total_len)->item = (struct lzms_item) {
                                                        .length = rep0_len,
                                                        .source = 0,
                                                };
                                                (cur_node + total_len)->extra_items[0] = (struct lzms_item) {
                                                        .length = 1,
                                                        .source = *(in_next + rep_len),
                                                };
                                                (cur_node + total_len)->extra_items[1] = (struct lzms_item) {
                                                        .length = rep_len,
                                                        .source = rep_idx,
                                                };
                                                (cur_node + total_len)->num_extra_items = 2;
                                        }
                                }
                        }
                }

                /* Repeat offset delta matches  */
                if (c->use_delta_matches &&
                    likely(in_next - c->in_buffer >= LZMS_NUM_DELTA_REPS + 1 &&
                           (in_end - in_next >= 2)))
                {
                        for (int rep_idx = 0; rep_idx < LZMS_NUM_DELTA_REPS; rep_idx++) {

                                /* Looking for a repeat offset delta match at
                                 * queue index @rep_idx  */

                                const u32 pair = cur_node->state.recent_delta_pairs[rep_idx];
                                const u32 power = pair >> DELTA_SOURCE_POWER_SHIFT;
                                const u32 raw_offset = pair & DELTA_SOURCE_RAW_OFFSET_MASK;
                                const u32 span = (u32)1 << power;
                                const u32 offset = raw_offset << power;
                                const u8 * const matchptr = in_next - offset;
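                                /*
                                 * Illustration (hypothetical values): a pair
                                 * with power == 2 and raw_offset == 5 decodes
                                 * to span == 1 << 2 == 4 and offset ==
                                 * 5 << 2 == 20, i.e. the match source lies 20
                                 * bytes back and is compared in terms of
                                 * 4-byte deltas.
                                 */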
                                /* Check the first 2 bytes before entering the
                                 * extension loop.  */
                                if (((u8)(*(in_next + 0) - *(in_next + 0 - span)) !=
                                     (u8)(*(matchptr + 0) - *(matchptr + 0 - span))) ||
                                    ((u8)(*(in_next + 1) - *(in_next + 1 - span)) !=
                                     (u8)(*(matchptr + 1) - *(matchptr + 1 - span))))
                                        continue;

                                /* Extend the match to its full length.  */
                                const u32 rep_len = lzms_extend_delta_match(in_next, matchptr,
                                                                            2, in_end - in_next,
                                                                            span);

                                /* Early out for long repeat offset delta match */
                                if (rep_len >= c->mf.nice_match_len) {

                                        in_next = lzms_skip_bytes(c, rep_len, in_next);

                                        lzms_encode_item_list(c, cur_node);
                                        lzms_encode_item(c, rep_len, DELTA_SOURCE_TAG | rep_idx);

                                        c->optimum_nodes[0].state = cur_node->state;
                                        cur_node = &c->optimum_nodes[0];

                                        cur_node->state.upcoming_delta_pair = pair;
                                        cur_node->state.upcoming_lz_offset = 0;
                                        for (int i = rep_idx; i < LZMS_NUM_DELTA_REPS; i++)
                                                cur_node->state.recent_delta_pairs[i] =
                                                        cur_node->state.recent_delta_pairs[i + 1];
                                        lzms_update_lru_queues(&cur_node->state);
                                        lzms_update_main_state(&cur_node->state, 1);
                                        lzms_update_match_state(&cur_node->state, 1);
                                        lzms_update_delta_state(&cur_node->state, 1);
                                        lzms_update_delta_rep_states(&cur_node->state, rep_idx);
                                        goto begin;
                                }

                                while (end_node < cur_node + rep_len)
                                        (++end_node)->cost = INFINITE_COST;

                                u32 base_cost = cur_node->cost +
                                                lzms_bit_1_cost(cur_node->state.main_state,
                                                                c->main_probs) +
                                                lzms_bit_1_cost(cur_node->state.match_state,
                                                                c->match_probs) +
                                                lzms_bit_1_cost(cur_node->state.delta_state,
                                                                c->delta_probs);

                                for (int i = 0; i < rep_idx; i++)
                                        base_cost += lzms_bit_1_cost(cur_node->state.delta_rep_states[i],
                                                                     c->delta_rep_probs[i]);

                                if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS)
                                        base_cost += lzms_bit_0_cost(cur_node->state.delta_rep_states[rep_idx],
                                                                     c->delta_rep_probs[rep_idx]);

                                u32 len = 2;
                                do {
                                        u32 cost = base_cost + lzms_fast_length_cost(c, len);
                                        if (cost < (cur_node + len)->cost) {
                                                (cur_node + len)->cost = cost;
                                                (cur_node + len)->item = (struct lzms_item) {
                                                        .length = len,
                                                        .source = DELTA_SOURCE_TAG | rep_idx,
                                                };
                                                (cur_node + len)->num_extra_items = 0;
                                        }
                                } while (++len <= rep_len);
                        }
                }

                /* Explicit offset LZ matches  */
                num_matches = lcpit_matchfinder_get_matches(&c->mf, c->matches);
                if (num_matches) {

                        u32 best_len = c->matches[0].length;

                        /* Early out for long explicit offset LZ match  */
                        if (best_len >= c->mf.nice_match_len) {

                                const u32 offset = c->matches[0].offset;

                                /* Extend the match as far as possible.
                                 * This is necessary because the LCP-interval
                                 * tree matchfinder only reports up to
                                 * nice_match_len bytes.  */
                                best_len = lz_extend(in_next, in_next - offset,
                                                     best_len, in_end - in_next);
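                                /* The matchfinder has already advanced one
                                 * position past this one (when the matches
                                 * were retrieved), so skip only the remaining
                                 * best_len - 1 positions, from in_next + 1.  */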
                                in_next = lzms_skip_bytes(c, best_len - 1, in_next + 1);

                                lzms_encode_item_list(c, cur_node);
                                lzms_encode_item(c, best_len, offset + LZMS_NUM_LZ_REPS - 1);

                                c->optimum_nodes[0].state = cur_node->state;
                                cur_node = &c->optimum_nodes[0];

                                cur_node->state.upcoming_lz_offset = offset;
                                cur_node->state.upcoming_delta_pair = 0;
                                lzms_update_lru_queues(&cur_node->state);
                                lzms_update_main_state(&cur_node->state, 1);
                                lzms_update_match_state(&cur_node->state, 0);
                                lzms_update_lz_state(&cur_node->state, 0);
                                goto begin;
                        }

                        while (end_node < cur_node + best_len)
                                (++end_node)->cost = INFINITE_COST;

                        u32 base_cost = cur_node->cost +
                                        lzms_bit_1_cost(cur_node->state.main_state,
                                                        c->main_probs) +
                                        lzms_bit_0_cost(cur_node->state.match_state,
                                                        c->match_probs) +
                                        lzms_bit_0_cost(cur_node->state.lz_state,
                                                        c->lz_probs);

                        if (c->try_lzmatch_lit_lzrep0 &&
                            likely(in_end - (in_next + c->matches[0].length) >= 3))
                        {
                                /* try LZ-match + lit + LZ-rep0  */

                                u32 l = 2;
                                u32 i = num_matches - 1;
                                do {
                                        const u32 len = c->matches[i].length;
                                        const u32 offset = c->matches[i].offset;
                                        const u32 position_cost = base_cost +
                                                                  lzms_lz_offset_cost(c, offset);
                                        do {
                                                u32 cost = position_cost + lzms_fast_length_cost(c, l);
                                                if (cost < (cur_node + l)->cost) {
                                                        (cur_node + l)->cost = cost;
                                                        (cur_node + l)->item = (struct lzms_item) {
                                                                .length = l,
                                                                .source = offset + (LZMS_NUM_LZ_REPS - 1),
                                                        };
                                                        (cur_node + l)->num_extra_items = 0;
                                                }
                                        } while (++l <= len);

                                        const u8 * const matchptr = in_next - offset;
                                        if (load_u16_unaligned(matchptr + len + 1) !=
                                            load_u16_unaligned(in_next + len + 1))
                                                continue;

                                        const u32 rep0_len = lz_extend(in_next + len + 1,
                                                                       matchptr + len + 1,
                                                                       2,
                                                                       min(c->mf.nice_match_len,
                                                                           in_end - (in_next + len + 1)));

                                        unsigned main_state = cur_node->state.main_state;
                                        unsigned match_state = cur_node->state.match_state;
                                        unsigned lz_state = cur_node->state.lz_state;

                                        /* update states for LZ-match  */
                                        main_state = ((main_state << 1) | 1) % LZMS_NUM_MAIN_PROBS;
                                        match_state = ((match_state << 1) | 0) % LZMS_NUM_MATCH_PROBS;
                                        lz_state = ((lz_state << 1) | 0) % LZMS_NUM_LZ_PROBS;

                                        /* LZ-match cost  */
                                        u32 cost = position_cost + lzms_fast_length_cost(c, len);

                                        /* add literal cost  */
                                        cost += lzms_literal_cost(c, main_state, *(in_next + len));

                                        /* update state for literal  */
                                        main_state = ((main_state << 1) | 0) % LZMS_NUM_MAIN_PROBS;

                                        /* add LZ-rep0 cost  */
                                        cost += lzms_bit_1_cost(main_state, c->main_probs) +
                                                lzms_bit_0_cost(match_state, c->match_probs) +
                                                lzms_bit_1_cost(lz_state, c->lz_probs) +
                                                lzms_bit_0_cost(cur_node->state.lz_rep_states[0],
                                                                c->lz_rep_probs[0]) +
                                                lzms_fast_length_cost(c, rep0_len);

                                        const u32 total_len = len + 1 + rep0_len;

                                        while (end_node < cur_node + total_len)
                                                (++end_node)->cost = INFINITE_COST;

                                        if (cost < (cur_node + total_len)->cost) {
                                                (cur_node + total_len)->cost = cost;
                                                (cur_node + total_len)->item = (struct lzms_item) {
                                                        .length = rep0_len,
                                                        .source = 0,
                                                };
                                                (cur_node + total_len)->extra_items[0] = (struct lzms_item) {
                                                        .length = 1,
                                                        .source = *(in_next + len),
                                                };
                                                (cur_node + total_len)->extra_items[1] = (struct lzms_item) {
                                                        .length = len,
                                                        .source = offset + LZMS_NUM_LZ_REPS - 1,
                                                };
                                                (cur_node + total_len)->num_extra_items = 2;
                                        }
                                } while (i--);
                        } else {
                                u32 l = 2;
                                u32 i = num_matches - 1;
                                do {
                                        u32 position_cost = base_cost +
                                                            lzms_lz_offset_cost(c, c->matches[i].offset);
                                        do {
                                                u32 cost = position_cost + lzms_fast_length_cost(c, l);
                                                if (cost < (cur_node + l)->cost) {
                                                        (cur_node + l)->cost = cost;
                                                        (cur_node + l)->item = (struct lzms_item) {
                                                                .length = l,
                                                                .source = c->matches[i].offset +
                                                                          (LZMS_NUM_LZ_REPS - 1),
                                                        };
                                                        (cur_node + l)->num_extra_items = 0;
                                                }
                                        } while (++l <= c->matches[i].length);
                                } while (i--);
                        }
                }

                /* Explicit offset delta matches  */
                if (c->use_delta_matches &&
                    likely(in_end - in_next >= NBYTES_HASHED_FOR_DELTA + 1))
                {
                        const u32 pos = in_next - c->in_buffer;

                        /* Consider each possible power (log2 of span)  */
                        BUILD_BUG_ON(NUM_POWERS_TO_CONSIDER > LZMS_NUM_DELTA_POWER_SYMS);
                        for (u32 power = 0; power < NUM_POWERS_TO_CONSIDER; power++) {

                                const u32 span = (u32)1 << power;

                                if (unlikely(pos < span))
                                        continue;

                                const u32 next_hash = lzms_delta_hash(in_next + 1, span);
                                const u32 hash = c->next_delta_hashes[power];
                                const u32 cur_match = c->delta_hash_table[hash];

                                c->delta_hash_table[hash] = (power << DELTA_SOURCE_POWER_SHIFT) | pos;
                                c->next_delta_hashes[power] = next_hash;
                                prefetch(&c->delta_hash_table[next_hash]);
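                                /*
                                 * The hashes are maintained one position
                                 * ahead: c->next_delta_hashes[power] already
                                 * holds the hash for the current position,
                                 * while the hash for in_next + 1 is computed
                                 * now and its table slot prefetched, hiding
                                 * the cache miss behind the rest of this
                                 * iteration's work.
                                 */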

                                if (power != cur_match >> DELTA_SOURCE_POWER_SHIFT)
                                        continue;

                                const u32 offset = pos - (cur_match & DELTA_SOURCE_RAW_OFFSET_MASK);

                                /* The offset must be a multiple of span.  */
                                if (offset & (span - 1))
                                        continue;

                                const u8 * const matchptr = in_next - offset;

                                /* Check the first 3 bytes before entering the
                                 * extension loop.  */
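                                /*
                                 * A delta match requires the byte differences
                                 * across the span to be equal, not the bytes
                                 * themselves.  E.g. with span == 1, the
                                 * (hypothetical) regions 05 06 07 08 and
                                 * 25 26 27 28 delta-match even though no
                                 * byte is equal.
                                 */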
                                BUILD_BUG_ON(NBYTES_HASHED_FOR_DELTA != 3);
                                if (((u8)(*(in_next + 0) - *(in_next + 0 - span)) !=
                                     (u8)(*(matchptr + 0) - *(matchptr + 0 - span))) ||
                                    ((u8)(*(in_next + 1) - *(in_next + 1 - span)) !=
                                     (u8)(*(matchptr + 1) - *(matchptr + 1 - span))) ||
                                    ((u8)(*(in_next + 2) - *(in_next + 2 - span)) !=
                                     (u8)(*(matchptr + 2) - *(matchptr + 2 - span))))
                                        continue;

                                /* Extend the delta match to its full length.  */
                                const u32 len = lzms_extend_delta_match(in_next,
                                                                        matchptr,
                                                                        3,
                                                                        in_end - in_next,
                                                                        span);

                                const u32 raw_offset = offset >> power;

                                if (unlikely(raw_offset > DELTA_SOURCE_RAW_OFFSET_MASK -
                                                          (LZMS_NUM_DELTA_REPS - 1)))
                                        continue;

                                const u32 pair = (power << DELTA_SOURCE_POWER_SHIFT) |
                                                 raw_offset;
                                const u32 source = DELTA_SOURCE_TAG |
                                                   (pair + LZMS_NUM_DELTA_REPS - 1);

                                /* Early out for long explicit offset delta match  */
                                if (len >= c->mf.nice_match_len) {

                                        in_next = lzms_skip_bytes(c, len - 1, in_next + 1);

                                        lzms_encode_item_list(c, cur_node);
                                        lzms_encode_item(c, len, source);

                                        c->optimum_nodes[0].state = cur_node->state;
                                        cur_node = &c->optimum_nodes[0];

                                        cur_node->state.upcoming_lz_offset = 0;
                                        cur_node->state.upcoming_delta_pair = pair;
                                        lzms_update_lru_queues(&cur_node->state);
                                        lzms_update_main_state(&cur_node->state, 1);
                                        lzms_update_match_state(&cur_node->state, 1);
                                        lzms_update_delta_state(&cur_node->state, 0);
                                        goto begin;
                                }

                                while (end_node < cur_node + len)
                                        (++end_node)->cost = INFINITE_COST;

                                u32 base_cost = cur_node->cost +
                                                lzms_bit_1_cost(cur_node->state.main_state,
                                                                c->main_probs) +
                                                lzms_bit_1_cost(cur_node->state.match_state,
                                                                c->match_probs) +
                                                lzms_bit_0_cost(cur_node->state.delta_state,
                                                                c->delta_probs) +
                                                lzms_delta_source_cost(c, power, raw_offset);

                                u32 l = NBYTES_HASHED_FOR_DELTA;
                                do {
                                        u32 cost = base_cost + lzms_fast_length_cost(c, l);
                                        if (cost < (cur_node + l)->cost) {
                                                (cur_node + l)->cost = cost;
                                                (cur_node + l)->item = (struct lzms_item) {
                                                        .length = l,
                                                        .source = source,
                                                };
                                                (cur_node + l)->num_extra_items = 0;
                                        }
                                } while (++l <= len);
                        }
                }

                /* Literal  */
                if (end_node < cur_node + 1)
                        (++end_node)->cost = INFINITE_COST;
                const u32 cur_and_lit_cost = cur_node->cost +
                                             lzms_literal_cost(c, cur_node->state.main_state,
                                                               *in_next);
                if (cur_and_lit_cost < (cur_node + 1)->cost) {
                        (cur_node + 1)->cost = cur_and_lit_cost;
                        (cur_node + 1)->item = (struct lzms_item) {
                                .length = 1,
                                .source = *in_next,
                        };
                        (cur_node + 1)->num_extra_items = 0;
                } else if (c->try_lit_lzrep0 && in_end - (in_next + 1) >= 2) {
                        /* try lit + LZ-rep0  */
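                        /*
                         * Editor's note: even when the literal alone does not
                         * improve the next node, literal-then-rep0-match can
                         * still improve a farther node, so the pair is priced
                         * here as a single two-item unit.  The offset tested
                         * is the one a rep0 match just after the literal
                         * would use: a prev_lz_offset not yet pushed into the
                         * queue takes precedence over recent_lz_offsets[0].
                         */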
                        const u32 offset =
                                (cur_node->state.prev_lz_offset) ?
                                        cur_node->state.prev_lz_offset :
                                        cur_node->state.recent_lz_offsets[0];

                        if (load_u16_unaligned(in_next + 1) ==
                            load_u16_unaligned(in_next + 1 - offset))
                        {
                                const u32 rep0_len = lz_extend(in_next + 1,
                                                               in_next + 1 - offset,
                                                               2,
                                                               min(in_end - (in_next + 1),
                                                                   c->mf.nice_match_len));

                                unsigned main_state = cur_node->state.main_state;

                                /* Update main_state after literal  */
                                main_state = (main_state << 1 | 0) % LZMS_NUM_MAIN_PROBS;

                                /* Add cost of LZ-rep0  */
                                const u32 cost = cur_and_lit_cost +
                                                 lzms_bit_1_cost(main_state, c->main_probs) +
                                                 lzms_bit_0_cost(cur_node->state.match_state,
                                                                 c->match_probs) +
                                                 lzms_bit_1_cost(cur_node->state.lz_state,
                                                                 c->lz_probs) +
                                                 lzms_bit_0_cost(cur_node->state.lz_rep_states[0],
                                                                 c->lz_rep_probs[0]) +
                                                 lzms_fast_length_cost(c, rep0_len);

                                const u32 total_len = 1 + rep0_len;

                                while (end_node < cur_node + total_len)
                                        (++end_node)->cost = INFINITE_COST;

                                if (cost < (cur_node + total_len)->cost) {
                                        (cur_node + total_len)->cost = cost;
                                        (cur_node + total_len)->item = (struct lzms_item) {
                                                .length = rep0_len,
                                                .source = 0,
                                        };
                                        (cur_node + total_len)->extra_items[0] = (struct lzms_item) {
                                                .length = 1,
                                                .source = *in_next,
                                        };
                                        (cur_node + total_len)->num_extra_items = 1;
                                }
                        }
                }

                /* Advance to the next position.  */
                in_next++;
                cur_node++;

                /* The lowest-cost path to the current position is now known.
                 * Finalize the adaptive state that results from taking this
                 * lowest-cost path.  */
                struct lzms_item item_to_take = cur_node->item;
                struct lzms_optimum_node *source_node = cur_node - (item_to_take.length);
                int next_item_idx = -1;
                for (unsigned i = 0; i < cur_node->num_extra_items; i++) {
                        item_to_take = cur_node->extra_items[i];
                        source_node -= item_to_take.length;
                        next_item_idx++;
                }
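                /*
                 * Items ending at a node are stored newest-first: ->item is
                 * the final item of the path piece, extra_items[0] the one
                 * before it, and so on.  E.g. for the lit + LZ-rep0 pair
                 * priced above, ->item is the rep0 match and extra_items[0]
                 * is the literal.  The loop above walked back to the node
                 * where the earliest item begins; the loop below replays the
                 * items' state updates in forward order.
                 */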
                cur_node->state = source_node->state;
                for (;;) {
                        const u32 length = item_to_take.length;
                        u32 source = item_to_take.source;

                        cur_node->state.upcoming_lz_offset = 0;
                        cur_node->state.upcoming_delta_pair = 0;
                        if (length > 1) {
                                /* Match  */

                                lzms_update_main_state(&cur_node->state, 1);

                                if (source & DELTA_SOURCE_TAG) {
                                        /* Delta match  */

                                        lzms_update_match_state(&cur_node->state, 1);
                                        source &= ~DELTA_SOURCE_TAG;

                                        if (source >= LZMS_NUM_DELTA_REPS) {
                                                /* Explicit offset delta match  */
                                                u32 pair = source - (LZMS_NUM_DELTA_REPS - 1);
                                                lzms_update_delta_state(&cur_node->state, 0);
                                                cur_node->state.upcoming_delta_pair = pair;
                                        } else {
                                                /* Repeat offset delta match  */
                                                int rep_idx = source;

                                                lzms_update_delta_state(&cur_node->state, 1);
                                                lzms_update_delta_rep_states(&cur_node->state, rep_idx);

                                                cur_node->state.upcoming_delta_pair =
                                                        cur_node->state.recent_delta_pairs[rep_idx];

                                                for (int i = rep_idx; i < LZMS_NUM_DELTA_REPS; i++)
                                                        cur_node->state.recent_delta_pairs[i] =
                                                                cur_node->state.recent_delta_pairs[i + 1];
                                        }
                                } else {
                                        lzms_update_match_state(&cur_node->state, 0);

                                        if (source >= LZMS_NUM_LZ_REPS) {
                                                /* Explicit offset LZ match  */
                                                lzms_update_lz_state(&cur_node->state, 0);
                                                cur_node->state.upcoming_lz_offset =
                                                        source - (LZMS_NUM_LZ_REPS - 1);
                                        } else {
                                                /* Repeat offset LZ match  */
                                                int rep_idx = source;

                                                lzms_update_lz_state(&cur_node->state, 1);
                                                lzms_update_lz_rep_states(&cur_node->state, rep_idx);

                                                cur_node->state.upcoming_lz_offset =
                                                        cur_node->state.recent_lz_offsets[rep_idx];

                                                for (int i = rep_idx; i < LZMS_NUM_LZ_REPS; i++)
                                                        cur_node->state.recent_lz_offsets[i] =
                                                                cur_node->state.recent_lz_offsets[i + 1];
                                        }
                                }
                        } else {
                                /* Literal  */
                                lzms_update_main_state(&cur_node->state, 0);
                        }

                        lzms_update_lru_queues(&cur_node->state);

                        if (next_item_idx < 0)
                                break;
                        if (next_item_idx == 0)
                                item_to_take = cur_node->item;
                        else
                                item_to_take = cur_node->extra_items[next_item_idx - 1];
                        --next_item_idx;
                }

                /*
                 * This loop will terminate when either of the following
                 * conditions is true:
                 *
                 * (1) cur_node == end_node
                 *
                 *      There are no paths that extend beyond the current
                 *      position.  In this case, any path to a later position
                 *      must pass through the current position, so we can go
                 *      ahead and choose the list of items that led to this
                 *      position.
                 *
                 * (2) cur_node == &c->optimum_nodes[NUM_OPTIM_NODES]
                 *
                 *      This bounds the number of times the algorithm can step
                 *      forward before it is guaranteed to start choosing items.
                 *      This limits the memory usage.  It also guarantees that
                 *      the parser will not go too long without updating the
                 *      probability tables.
                 *
                 * Note: no check for end-of-buffer is needed because
                 * end-of-buffer will trigger condition (1).
                 */
                if (cur_node == end_node ||
                    cur_node == &c->optimum_nodes[NUM_OPTIM_NODES])
                {
                        lzms_encode_nonempty_item_list(c, cur_node);
                        c->optimum_nodes[0].state = cur_node->state;
                        goto begin;
                }
        }
}

static void
lzms_init_states_and_probabilities(struct lzms_compressor *c)
{
        c->main_state = 0;
        c->match_state = 0;
        c->lz_state = 0;
        for (int i = 0; i < LZMS_NUM_LZ_REP_DECISIONS; i++)
                c->lz_rep_states[i] = 0;
        c->delta_state = 0;
        for (int i = 0; i < LZMS_NUM_DELTA_REP_DECISIONS; i++)
                c->delta_rep_states[i] = 0;

        lzms_init_probability_entries(c->main_probs, LZMS_NUM_MAIN_PROBS);
        lzms_init_probability_entries(c->match_probs, LZMS_NUM_MATCH_PROBS);
        lzms_init_probability_entries(c->lz_probs, LZMS_NUM_LZ_PROBS);
        for (int i = 0; i < LZMS_NUM_LZ_REP_DECISIONS; i++)
                lzms_init_probability_entries(c->lz_rep_probs[i], LZMS_NUM_LZ_REP_PROBS);
        lzms_init_probability_entries(c->delta_probs, LZMS_NUM_DELTA_PROBS);
        for (int i = 0; i < LZMS_NUM_DELTA_REP_DECISIONS; i++)
                lzms_init_probability_entries(c->delta_rep_probs[i], LZMS_NUM_DELTA_REP_PROBS);
}

static void
lzms_init_huffman_codes(struct lzms_compressor *c, unsigned num_offset_slots)
{
        lzms_init_huffman_code(&c->literal_rebuild_info,
                               LZMS_NUM_LITERAL_SYMS,
                               LZMS_LITERAL_CODE_REBUILD_FREQ,
                               c->literal_codewords,
                               c->literal_lens,
                               c->literal_freqs);

        lzms_init_huffman_code(&c->lz_offset_rebuild_info,
                               num_offset_slots,
                               LZMS_LZ_OFFSET_CODE_REBUILD_FREQ,
                               c->lz_offset_codewords,
                               c->lz_offset_lens,
                               c->lz_offset_freqs);

        lzms_init_huffman_code(&c->length_rebuild_info,
                               LZMS_NUM_LENGTH_SYMS,
                               LZMS_LENGTH_CODE_REBUILD_FREQ,
                               c->length_codewords,
                               c->length_lens,
                               c->length_freqs);

        lzms_init_huffman_code(&c->delta_offset_rebuild_info,
                               num_offset_slots,
                               LZMS_DELTA_OFFSET_CODE_REBUILD_FREQ,
                               c->delta_offset_codewords,
                               c->delta_offset_lens,
                               c->delta_offset_freqs);

        lzms_init_huffman_code(&c->delta_power_rebuild_info,
                               LZMS_NUM_DELTA_POWER_SYMS,
                               LZMS_DELTA_POWER_CODE_REBUILD_FREQ,
                               c->delta_power_codewords,
                               c->delta_power_lens,
                               c->delta_power_freqs);
}

/*
 * Flush the output streams, prepare the final compressed data, and return its
 * size in bytes.
 *
 * A return value of 0 indicates that the data could not be compressed to fit in
 * the available space.
 */
static size_t
lzms_finalize(struct lzms_compressor *c)
{
        size_t num_forwards_units;
        size_t num_backwards_units;

        /* Flush both the forwards and backwards streams, and make sure they
         * didn't cross each other and start overwriting each other's data.  */
        if (!lzms_output_bitstream_flush(&c->os))
                return 0;

        if (!lzms_range_encoder_flush(&c->rc))
                return 0;

        if (c->rc.next > c->os.next)
                return 0;

        /* Now the compressed buffer contains the data output by the forwards
         * bitstream, then empty space, then data output by the backwards
         * bitstream.  Move the data output by the backwards bitstream to be
         * adjacent to the data output by the forward bitstream, and calculate
         * the compressed size that this results in.  */
        num_forwards_units = c->rc.next - c->rc.begin;
        num_backwards_units = c->rc.end - c->os.next;
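        /*
         * Buffer layout at this point, in 16-bit units:
         *
         *    rc.begin          rc.next         os.next           rc.end
         *    | range encoder output | unused gap | bitstream output |
         *
         * The memmove() below closes the gap so that the two outputs become
         * adjacent.
         */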

        memmove(c->rc.next, c->os.next, num_backwards_units * sizeof(le16));

        return (num_forwards_units + num_backwards_units) * sizeof(le16);
}

static u64
lzms_get_needed_memory(size_t max_bufsize, unsigned compression_level,
                       bool destructive)
{
        u64 size = 0;

        if (max_bufsize > LZMS_MAX_BUFFER_SIZE)
                return 0;

        size += sizeof(struct lzms_compressor);

        if (!destructive)
                size += max_bufsize; /* in_buffer */

        /* mf */
        size += lcpit_matchfinder_get_needed_memory(max_bufsize);

        return size;
}

static int
lzms_create_compressor(size_t max_bufsize, unsigned compression_level,
                       bool destructive, void **c_ret)
{
        struct lzms_compressor *c;
        u32 nice_match_len;

        if (max_bufsize > LZMS_MAX_BUFFER_SIZE)
                return WIMLIB_ERR_INVALID_PARAM;

        c = ALIGNED_MALLOC(sizeof(struct lzms_compressor), 64);
        if (!c)
                goto oom0;

        c->destructive = destructive;

        /* Scale nice_match_len with the compression level.  But to allow an
         * optimization for length cost calculations, don't allow nice_match_len
         * to exceed MAX_FAST_LENGTH.  */
        nice_match_len = min(((u64)compression_level * 63) / 50, MAX_FAST_LENGTH);
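        /* E.g. compression level 50 gives nice_match_len = 63; from level 203
         * upward the computed value reaches the MAX_FAST_LENGTH cap (255).  */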

        c->use_delta_matches = (compression_level >= 35);
        c->try_lzmatch_lit_lzrep0 = (compression_level >= 45);
        c->try_lit_lzrep0 = (compression_level >= 60);
        c->try_lzrep_lit_lzrep0 = (compression_level >= 60);

        if (!c->destructive) {
                c->in_buffer = MALLOC(max_bufsize);
                if (!c->in_buffer)
                        goto oom1;
        }

        if (!lcpit_matchfinder_init(&c->mf, max_bufsize, 2, nice_match_len))
                goto oom2;

        lzms_init_fast_length_slot_tab(c);
        lzms_init_offset_slot_tabs(c);

        *c_ret = c;
        return 0;

oom2:
        if (!c->destructive)
                FREE(c->in_buffer);
oom1:
        ALIGNED_FREE(c);
oom0:
        return WIMLIB_ERR_NOMEM;
}

static size_t
lzms_compress(const void *in, size_t in_nbytes,
              void *out, size_t out_nbytes_avail, void *_c)
{
        struct lzms_compressor *c = _c;
        size_t result;

        /* Don't bother trying to compress extremely small inputs.  */
        if (in_nbytes < 4)
                return 0;

        /* Copy the input data into the internal buffer (or, in destructive
         * mode, use the caller's buffer in place), then preprocess it.  */
        if (c->destructive)
                c->in_buffer = (void *)in;
        else
                memcpy(c->in_buffer, in, in_nbytes);
        c->in_nbytes = in_nbytes;
        lzms_x86_filter(c->in_buffer, in_nbytes, c->last_target_usages, false);

        /* Prepare the matchfinders.  */
        lcpit_matchfinder_load_buffer(&c->mf, c->in_buffer, c->in_nbytes);
        if (c->use_delta_matches)
                lzms_init_delta_matchfinder(c);

        /* Initialize the encoder structures.  */
        lzms_range_encoder_init(&c->rc, out, out_nbytes_avail / sizeof(le16));
        lzms_output_bitstream_init(&c->os, out, out_nbytes_avail / sizeof(le16));
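        /* Both encoders share the whole output buffer: the range encoder
         * writes 16-bit units forwards from the start, while the output
         * bitstream writes backwards from the end.  lzms_finalize() checks
         * that the two streams never crossed.  */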
        lzms_init_states_and_probabilities(c);
        lzms_init_huffman_codes(c, lzms_get_num_offset_slots(c->in_nbytes));

        /* The main loop: parse and encode.  */
        lzms_near_optimal_parse(c);

        /* Return the compressed data size, or 0 if the data didn't fit.  If it
         * didn't fit and the caller's buffer was modified in place, undo the
         * x86 preprocessing.  */
        result = lzms_finalize(c);
        if (!result && c->destructive)
                lzms_x86_filter(c->in_buffer, c->in_nbytes, c->last_target_usages, true);
        return result;
}

static void
lzms_free_compressor(void *_c)
{
        struct lzms_compressor *c = _c;

        if (!c->destructive)
                FREE(c->in_buffer);
        lcpit_matchfinder_destroy(&c->mf);
        ALIGNED_FREE(c);
}

const struct compressor_ops lzms_compressor_ops = {
        .get_needed_memory  = lzms_get_needed_memory,
        .create_compressor  = lzms_create_compressor,
        .compress           = lzms_compress,
        .free_compressor    = lzms_free_compressor,
};