/*
 * lzms_compress.c
 *
 * A compressor for the LZMS compression format.
 */

/*
 * Copyright (C) 2013, 2014, 2015 Eric Biggers
 *
 * This file is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 3 of the License, or (at your option) any
 * later version.
 *
 * This file is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
 * details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this file; if not, see http://www.gnu.org/licenses/.
 */

#ifdef HAVE_CONFIG_H
#  include "config.h"
#endif

#include <limits.h>
#include <string.h>

#include "wimlib/compress_common.h"
#include "wimlib/compressor_ops.h"
#include "wimlib/error.h"
#include "wimlib/lcpit_matchfinder.h"
#include "wimlib/lz_extend.h"
#include "wimlib/lz_hash.h"
#include "wimlib/lzms_common.h"
#include "wimlib/unaligned.h"
#include "wimlib/util.h"

/*
 * MAX_FAST_LENGTH is the maximum match length for which the length slot can be
 * looked up directly in 'fast_length_slot_tab' and the length cost can be
 * looked up directly in 'fast_length_cost_tab'.
 *
 * We also limit the 'nice_match_len' parameter to this value.  Consequently, if
 * the longest match found is shorter than 'nice_match_len', then it must also
 * be shorter than MAX_FAST_LENGTH.  This makes it possible to do fast lookups
 * of length costs using 'fast_length_cost_tab' without having to keep checking
 * whether the length exceeds MAX_FAST_LENGTH or not.
 */
#define MAX_FAST_LENGTH         255

/* NUM_OPTIM_NODES is the maximum number of bytes the parsing algorithm will
 * step forward before forcing the pending items to be encoded.  If this value
 * is increased, then there will be fewer forced flushes, but the probability
 * entries and Huffman codes will be more likely to become outdated.  */
#define NUM_OPTIM_NODES         2048

/* COST_SHIFT is a scaling factor that makes it possible to consider fractional
 * bit costs.  A single bit has a cost of (1 << COST_SHIFT).  */
#define COST_SHIFT              6
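
/* Worked example (illustrative, not from the original source): with
 * COST_SHIFT == 6, one bit costs 1 << 6 == 64 cost units.  An event with
 * probability 3/4 carries about -log2(3/4) == 0.415 bits of information,
 * which scales to round(0.415 * 64) == 27 cost units; this matches the
 * entry for num_recent_zero_bits == 48 (probability 48/64) in the
 * 'lzms_bit_costs' table defined later in this file.  */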

/* Length of the hash table for finding delta matches  */
#define DELTA_HASH_ORDER        17
#define DELTA_HASH_LENGTH       ((u32)1 << DELTA_HASH_ORDER)

/* The number of bytes to hash when finding delta matches; also taken to be the
 * minimum length of an explicit offset delta match  */
#define NBYTES_HASHED_FOR_DELTA 3

/* The number of delta match powers to consider (must be <=
 * LZMS_NUM_DELTA_POWER_SYMS)  */
#define NUM_POWERS_TO_CONSIDER  6

/* This structure tracks the state of writing bits as a series of 16-bit coding
 * units, starting at the end of the output buffer and proceeding backwards.  */
struct lzms_output_bitstream {

        /* Bits that haven't yet been written to the output buffer  */
        u64 bitbuf;

        /* Number of bits currently held in @bitbuf  */
        unsigned bitcount;

        /* Pointer to the beginning of the output buffer (this is the "end" when
         * writing backwards!)  */
        u8 *begin;

        /* Pointer to just past the next position in the output buffer at which
         * to output a 16-bit coding unit  */
        u8 *next;
};

/* This structure tracks the state of range encoding and its output, which
 * starts at the beginning of the output buffer and proceeds forwards.  */
struct lzms_range_encoder {

        /* The lower boundary of the current range.  Logically, this is a 33-bit
         * integer whose high bit is needed to detect carries.  */
        u64 lower_bound;

        /* The size of the current range  */
        u32 range_size;

        /* The next 16-bit coding unit to output  */
        u16 cache;

        /* The number of 16-bit coding units whose output has been delayed due
         * to possible carrying.  The first such coding unit is @cache; all
         * subsequent such coding units are 0xffff.  */
        u32 cache_size;

        /* Pointer to the beginning of the output buffer  */
        u8 *begin;

        /* Pointer to the position in the output buffer at which the next coding
         * unit must be written  */
        u8 *next;

        /* Pointer to just past the end of the output buffer  */
        u8 *end;
};
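
/* Illustrative note (an assumption based on the two structures above, not a
 * comment from the original source): both encoders share one output buffer.
 * The range encoder fills 16-bit coding units forwards from the front while
 * the bitstream fills 16-bit coding units backwards from the back:
 *
 *     begin                                                          end
 *       |--> range encoder output ...            ... bitstream output <--|
 *
 * Their flush routines report failure if either stream would overrun.  */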

/* Bookkeeping information for an adaptive Huffman code  */
struct lzms_huffman_rebuild_info {

        /* The remaining number of symbols to encode until this code must be
         * rebuilt  */
        unsigned num_syms_until_rebuild;

        /* The number of symbols in this code  */
        unsigned num_syms;

        /* The rebuild frequency of this code, in symbols  */
        unsigned rebuild_freq;

        /* The Huffman codeword of each symbol in this code  */
        u32 *codewords;

        /* The length of each Huffman codeword, in bits  */
        u8 *lens;

        /* The frequency of each symbol in this code  */
        u32 *freqs;
};

/*
 * The compressor-internal representation of a match or literal.
 *
 * Literals have length=1; matches have length > 1.  (We disallow matches of
 * length 1, even though this is a valid length in LZMS.)
 *
 * The source is encoded as follows:
 *
 * - Literals: the literal byte itself
 * - Explicit offset LZ matches: the match offset plus (LZMS_NUM_LZ_REPS - 1)
 * - Repeat offset LZ matches: the index of the offset in recent_lz_offsets
 * - Explicit offset delta matches: DELTA_SOURCE_TAG is set, the next 3 bits are
 *   the power, and the remainder is the raw offset plus (LZMS_NUM_DELTA_REPS-1)
 * - Repeat offset delta matches: DELTA_SOURCE_TAG is set, and the remainder is
 *   the index of the (power, raw_offset) pair in recent_delta_pairs
 */
struct lzms_item {
        u32 length;
        u32 source;
};

#define DELTA_SOURCE_TAG                ((u32)1 << 31)
#define DELTA_SOURCE_POWER_SHIFT        28
#define DELTA_SOURCE_RAW_OFFSET_MASK    (((u32)1 << DELTA_SOURCE_POWER_SHIFT) - 1)

static _unused_attribute void
check_that_powers_fit_in_bitfield(void)
{
        STATIC_ASSERT(LZMS_NUM_DELTA_POWER_SYMS <= (1 << (31 - DELTA_SOURCE_POWER_SHIFT)));
}
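
/* A minimal sketch (not part of the original source) of how the 'source'
 * field of an explicit offset delta match is packed and unpacked using the
 * definitions above; the helper names are hypothetical.  Kept under '#if 0'
 * like the illustrative code elsewhere in this file.  */
#if 0
static u32
pack_delta_source(u32 power, u32 raw_offset)
{
        /* Tag in bit 31, power in bits [28, 31), biased raw offset below  */
        return DELTA_SOURCE_TAG |
               (power << DELTA_SOURCE_POWER_SHIFT) |
               (raw_offset + (LZMS_NUM_DELTA_REPS - 1));
}

static void
unpack_delta_source(u32 source, u32 *power_ret, u32 *raw_offset_ret)
{
        source &= ~DELTA_SOURCE_TAG;
        *power_ret = source >> DELTA_SOURCE_POWER_SHIFT;
        *raw_offset_ret = (source & DELTA_SOURCE_RAW_OFFSET_MASK) -
                          (LZMS_NUM_DELTA_REPS - 1);
}
#endif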

/* A stripped-down version of the adaptive state in LZMS which excludes the
 * probability entries and Huffman codes  */
struct lzms_adaptive_state {

        /* Recent offsets for LZ matches  */
        u32 recent_lz_offsets[LZMS_NUM_LZ_REPS + 1];
        u32 prev_lz_offset; /* 0 means none */
        u32 upcoming_lz_offset; /* 0 means none */

        /* Recent (power, raw offset) pairs for delta matches.
         * The low DELTA_SOURCE_POWER_SHIFT bits of each entry are the raw
         * offset, and the high bits are the power.  */
        u32 recent_delta_pairs[LZMS_NUM_DELTA_REPS + 1];
        u32 prev_delta_pair; /* 0 means none */
        u32 upcoming_delta_pair; /* 0 means none  */

        /* States for predicting the probabilities of item types  */
        u8 main_state;
        u8 match_state;
        u8 lz_state;
        u8 lz_rep_states[LZMS_NUM_LZ_REP_DECISIONS];
        u8 delta_state;
        u8 delta_rep_states[LZMS_NUM_DELTA_REP_DECISIONS];
} _aligned_attribute(64);

/*
 * This structure represents a byte position in the preprocessed input data and
 * a node in the graph of possible match/literal choices.
 *
 * Logically, each incoming edge to this node is labeled with a literal or a
 * match that can be taken to reach this position from an earlier position; and
 * each outgoing edge from this node is labeled with a literal or a match that
 * can be taken to advance from this position to a later position.
 */
struct lzms_optimum_node {

        /*
         * The cost of the lowest-cost path that has been found to reach this
         * position.  This can change as progressively lower cost paths are
         * found to reach this position.
         */
        u32 cost;
#define INFINITE_COST UINT32_MAX

        /*
         * @item is the last item that was taken to reach this position with
         * the stored @cost.  This can change as progressively lower cost
         * paths are found to reach this position.
         *
         * In some cases we look ahead more than one item.  If we looked ahead n
         * items to reach this position, then @item is the last item taken,
         * @extra_items contains the other items ordered from second-to-last to
         * first, and @num_extra_items is n - 1.
         */
        unsigned num_extra_items;
        struct lzms_item item;
        struct lzms_item extra_items[2];

        /*
         * The adaptive state that exists at this position.  This is filled in
         * lazily, only after the minimum-cost path to this position is found.
         *
         * Note: the way the algorithm handles this adaptive state in the
         * "minimum-cost" parse is actually only an approximation.  It's
         * possible for the globally optimal, minimum cost path to contain a
         * prefix, ending at a position, where that path prefix is *not* the
         * minimum cost path to that position.  This can happen if such a path
         * prefix results in a different adaptive state which results in lower
         * costs later.  Although the algorithm does do some heuristic
         * multi-item lookaheads, it does not solve this problem in general.
         *
         * Note: this adaptive state structure also does not include the
         * probability entries or current Huffman codewords.  Those aren't
         * maintained per-position and are only updated occasionally.
         */
        struct lzms_adaptive_state state;
} _aligned_attribute(64);

/* The main compressor structure  */
struct lzms_compressor {

        /* The matchfinder for LZ matches  */
        struct lcpit_matchfinder mf;

        /* The preprocessed buffer of data being compressed  */
        u8 *in_buffer;

        /* The number of bytes of data to be compressed, which is the number of
         * bytes of data in @in_buffer that are actually valid  */
        size_t in_nbytes;

        /*
         * Boolean flags to enable consideration of various types of multi-step
         * operations during parsing.
         *
         * Among other cases, multi-step operations can help with gaps where two
         * matches are separated by a non-matching byte.
         *
         * This idea is borrowed from Igor Pavlov's LZMA encoder.
         */
        bool try_lit_lzrep0;
        bool try_lzrep_lit_lzrep0;
        bool try_lzmatch_lit_lzrep0;

        /*
         * If true, the compressor can use delta matches.  This slows down
         * compression.  It improves the compression ratio greatly, slightly, or
         * not at all, depending on the input data.
         */
        bool use_delta_matches;

        /* If true, the compressor need not preserve the input buffer if it
         * compresses the data successfully.  */
        bool destructive;

        /* 'last_target_usages' is a large array that is only needed for
         * preprocessing, so it is in union with fields that don't need to be
         * initialized until after preprocessing.  */
        union {
        struct {

        /* Temporary space to store matches found by the LZ matchfinder  */
        struct lz_match matches[MAX_FAST_LENGTH - LZMS_MIN_MATCH_LENGTH + 1];

        /* Hash table for finding delta matches  */
        u32 delta_hash_table[DELTA_HASH_LENGTH];

        /* For each delta power, the hash code for the next sequence  */
        u32 next_delta_hashes[NUM_POWERS_TO_CONSIDER];

        /* The per-byte graph nodes for near-optimal parsing  */
        struct lzms_optimum_node optimum_nodes[NUM_OPTIM_NODES + MAX_FAST_LENGTH +
                                               1 + MAX_FAST_LENGTH];

        /* Table: length => current cost for small match lengths  */
        u32 fast_length_cost_tab[MAX_FAST_LENGTH + 1];

        /* Range encoder which outputs to the beginning of the compressed data
         * buffer, proceeding forwards  */
        struct lzms_range_encoder rc;

        /* Bitstream which outputs to the end of the compressed data buffer,
         * proceeding backwards  */
        struct lzms_output_bitstream os;

        /* States and probability entries for item type disambiguation  */
        unsigned main_state;
        unsigned match_state;
        unsigned lz_state;
        unsigned lz_rep_states[LZMS_NUM_LZ_REP_DECISIONS];
        unsigned delta_state;
        unsigned delta_rep_states[LZMS_NUM_DELTA_REP_DECISIONS];
        struct lzms_probabilites probs;

        /* Huffman codes  */

        struct lzms_huffman_rebuild_info literal_rebuild_info;
        u32 literal_codewords[LZMS_NUM_LITERAL_SYMS];
        u8 literal_lens[LZMS_NUM_LITERAL_SYMS];
        u32 literal_freqs[LZMS_NUM_LITERAL_SYMS];

        struct lzms_huffman_rebuild_info lz_offset_rebuild_info;
        u32 lz_offset_codewords[LZMS_MAX_NUM_OFFSET_SYMS];
        u8 lz_offset_lens[LZMS_MAX_NUM_OFFSET_SYMS];
        u32 lz_offset_freqs[LZMS_MAX_NUM_OFFSET_SYMS];

        struct lzms_huffman_rebuild_info length_rebuild_info;
        u32 length_codewords[LZMS_NUM_LENGTH_SYMS];
        u8 length_lens[LZMS_NUM_LENGTH_SYMS];
        u32 length_freqs[LZMS_NUM_LENGTH_SYMS];

        struct lzms_huffman_rebuild_info delta_offset_rebuild_info;
        u32 delta_offset_codewords[LZMS_MAX_NUM_OFFSET_SYMS];
        u8 delta_offset_lens[LZMS_MAX_NUM_OFFSET_SYMS];
        u32 delta_offset_freqs[LZMS_MAX_NUM_OFFSET_SYMS];

        struct lzms_huffman_rebuild_info delta_power_rebuild_info;
        u32 delta_power_codewords[LZMS_NUM_DELTA_POWER_SYMS];
        u8 delta_power_lens[LZMS_NUM_DELTA_POWER_SYMS];
        u32 delta_power_freqs[LZMS_NUM_DELTA_POWER_SYMS];

        }; /* struct */

        s32 last_target_usages[65536];

        }; /* union */

        /* Table: length => length slot for small match lengths  */
        u8 fast_length_slot_tab[MAX_FAST_LENGTH + 1];

        /* Tables for mapping offsets to offset slots  */

        /* slots [0, 167); 0 <= num_extra_bits <= 10 */
        u8 offset_slot_tab_1[0xe4a5];

        /* slots [167, 427); 11 <= num_extra_bits <= 15 */
        u16 offset_slot_tab_2[0x3d0000 >> 11];

        /* slots [427, 799); 16 <= num_extra_bits  */
        u16 offset_slot_tab_3[((LZMS_MAX_MATCH_OFFSET + 1) - 0xe4a5) >> 16];
};

/******************************************************************************
 *                   Offset and length slot acceleration                      *
 ******************************************************************************/

/* Generate the acceleration table for length slots.  */
static void
lzms_init_fast_length_slot_tab(struct lzms_compressor *c)
{
        unsigned slot = 0;
        for (u32 len = LZMS_MIN_MATCH_LENGTH; len <= MAX_FAST_LENGTH; len++) {
                if (len >= lzms_length_slot_base[slot + 1])
                        slot++;
                c->fast_length_slot_tab[len] = slot;
        }
}

/* Generate the acceleration tables for offset slots.  */
static void
lzms_init_offset_slot_tabs(struct lzms_compressor *c)
{
        u32 offset;
        unsigned slot = 0;

        /* slots [0, 167); 0 <= num_extra_bits <= 10 */
        for (offset = 1; offset < 0xe4a5; offset++) {
                if (offset >= lzms_offset_slot_base[slot + 1])
                        slot++;
                c->offset_slot_tab_1[offset] = slot;
        }

        /* slots [167, 427); 11 <= num_extra_bits <= 15 */
        for (; offset < 0x3de4a5; offset += (u32)1 << 11) {
                if (offset >= lzms_offset_slot_base[slot + 1])
                        slot++;
                c->offset_slot_tab_2[(offset - 0xe4a5) >> 11] = slot;
        }

        /* slots [427, 799); 16 <= num_extra_bits  */
        for (; offset < LZMS_MAX_MATCH_OFFSET + 1; offset += (u32)1 << 16) {
                if (offset >= lzms_offset_slot_base[slot + 1])
                        slot++;
                c->offset_slot_tab_3[(offset - 0xe4a5) >> 16] = slot;
        }
}

/*
 * Return the length slot for the specified match length, using the compressor's
 * acceleration table if the length is small enough.
 */
static forceinline unsigned
lzms_comp_get_length_slot(const struct lzms_compressor *c, u32 length)
{
        if (likely(length <= MAX_FAST_LENGTH))
                return c->fast_length_slot_tab[length];
        return lzms_get_length_slot(length);
}

/*
 * Return the offset slot for the specified match offset, using the compressor's
 * acceleration tables to speed up the mapping.
 */
static forceinline unsigned
lzms_comp_get_offset_slot(const struct lzms_compressor *c, u32 offset)
{
        if (offset < 0xe4a5)
                return c->offset_slot_tab_1[offset];
        offset -= 0xe4a5;
        if (offset < 0x3d0000)
                return c->offset_slot_tab_2[offset >> 11];
        return c->offset_slot_tab_3[offset >> 16];
}
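
/* Example (illustrative, not from the original source): an offset of 0x1000
 * is looked up directly in offset_slot_tab_1, whereas an offset of 0x10000
 * falls in the second range and is looked up at index
 * (0x10000 - 0xe4a5) >> 11 of offset_slot_tab_2.  The tables trade a little
 * memory for avoiding a slower generic slot computation on every call.  */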

/******************************************************************************
 *                             Range encoding                                 *
 ******************************************************************************/

/*
 * Initialize the range encoder @rc to write forwards to the specified buffer
 * @out that is @size bytes long.
 */
static void
lzms_range_encoder_init(struct lzms_range_encoder *rc, u8 *out, size_t size)
{
        rc->lower_bound = 0;
        rc->range_size = 0xffffffff;
        rc->cache = 0;
        rc->cache_size = 1;
        rc->begin = out;
        rc->next = out - sizeof(le16);
        rc->end = out + (size & ~1);
}

/*
 * Attempt to flush bits from the range encoder.
 *
 * The basic idea is that we're writing bits from @rc->lower_bound to the
 * output.  However, due to carrying, the writing of coding units with the
 * maximum value, as well as one prior coding unit, must be delayed until it is
 * determined whether a carry is needed.
 *
 * This is based on the public domain code for LZMA written by Igor Pavlov, but
 * with the following differences:
 *
 *      - In LZMS, 16-bit coding units are required rather than 8-bit.
 *
 *      - In LZMS, the first coding unit is not ignored by the decompressor, so
 *        the encoder cannot output a dummy value to that position.
 */
static void
lzms_range_encoder_shift_low(struct lzms_range_encoder *rc)
{
        if ((u32)(rc->lower_bound) < 0xffff0000 ||
            (u32)(rc->lower_bound >> 32) != 0)
        {
                /* Carry not needed (rc->lower_bound < 0xffff0000), or carry
                 * occurred ((rc->lower_bound >> 32) != 0, a.k.a. the carry bit
                 * is 1).  */
                do {
                        if (likely(rc->next >= rc->begin)) {
                                if (rc->next != rc->end) {
                                        put_unaligned_le16(rc->cache +
                                                           (u16)(rc->lower_bound >> 32),
                                                           rc->next);
                                        rc->next += sizeof(le16);
                                }
                        } else {
                                rc->next += sizeof(le16);
                        }
                        rc->cache = 0xffff;
                } while (--rc->cache_size != 0);

                rc->cache = (rc->lower_bound >> 16) & 0xffff;
        }
        ++rc->cache_size;
        rc->lower_bound = (rc->lower_bound & 0xffff) << 16;
}

static bool
lzms_range_encoder_flush(struct lzms_range_encoder *rc)
{
        for (int i = 0; i < 4; i++)
                lzms_range_encoder_shift_low(rc);
        return rc->next != rc->end;
}

/*
 * Encode the next bit using the range encoder.
 *
 * @prob is the probability out of LZMS_PROBABILITY_DENOMINATOR that the next
 * bit is 0 rather than 1.
 */
static forceinline void
lzms_range_encode_bit(struct lzms_range_encoder *rc, int bit, u32 prob)
{
        /* Normalize if needed.  */
        if (rc->range_size <= 0xffff) {
                rc->range_size <<= 16;
                lzms_range_encoder_shift_low(rc);
        }

        u32 bound = (rc->range_size >> LZMS_PROBABILITY_BITS) * prob;
        if (bit == 0) {
                rc->range_size = bound;
        } else {
                rc->lower_bound += bound;
                rc->range_size -= bound;
        }
}

/*
 * Encode a bit.  This wraps around lzms_range_encode_bit() to handle using and
 * updating the state and its corresponding probability entry.
 */
static forceinline void
lzms_encode_bit(int bit, unsigned *state_p, unsigned num_states,
                struct lzms_probability_entry *probs,
                struct lzms_range_encoder *rc)
{
        struct lzms_probability_entry *prob_entry;
        u32 prob;

        /* Load the probability entry for the current state.  */
        prob_entry = &probs[*state_p];

        /* Update the state based on the next bit.  */
        *state_p = ((*state_p << 1) | bit) & (num_states - 1);

        /* Get the probability that the bit is 0.  */
        prob = lzms_get_probability(prob_entry);

        /* Update the probability entry.  */
        lzms_update_probability_entry(prob_entry, bit);

        /* Encode the bit using the range encoder.  */
        lzms_range_encode_bit(rc, bit, prob);
}

/* Helper functions for encoding bits in the various decision classes  */

static void
lzms_encode_main_bit(struct lzms_compressor *c, int bit)
{
        lzms_encode_bit(bit, &c->main_state, LZMS_NUM_MAIN_PROBS,
                        c->probs.main, &c->rc);
}

static void
lzms_encode_match_bit(struct lzms_compressor *c, int bit)
{
        lzms_encode_bit(bit, &c->match_state, LZMS_NUM_MATCH_PROBS,
                        c->probs.match, &c->rc);
}

static void
lzms_encode_lz_bit(struct lzms_compressor *c, int bit)
{
        lzms_encode_bit(bit, &c->lz_state, LZMS_NUM_LZ_PROBS,
                        c->probs.lz, &c->rc);
}

static void
lzms_encode_lz_rep_bit(struct lzms_compressor *c, int bit, int idx)
{
        lzms_encode_bit(bit, &c->lz_rep_states[idx], LZMS_NUM_LZ_REP_PROBS,
                        c->probs.lz_rep[idx], &c->rc);
}

static void
lzms_encode_delta_bit(struct lzms_compressor *c, int bit)
{
        lzms_encode_bit(bit, &c->delta_state, LZMS_NUM_DELTA_PROBS,
                        c->probs.delta, &c->rc);
}

static void
lzms_encode_delta_rep_bit(struct lzms_compressor *c, int bit, int idx)
{
        lzms_encode_bit(bit, &c->delta_rep_states[idx], LZMS_NUM_DELTA_REP_PROBS,
                        c->probs.delta_rep[idx], &c->rc);
}

/******************************************************************************
 *                   Huffman encoding and verbatim bits                       *
 ******************************************************************************/

/*
 * Initialize the output bitstream @os to write backwards to the specified
 * buffer @out that is @size bytes long.
 */
static void
lzms_output_bitstream_init(struct lzms_output_bitstream *os,
                           u8 *out, size_t size)
{
        os->bitbuf = 0;
        os->bitcount = 0;
        os->begin = out;
        os->next = out + (size & ~1);
}

/*
 * Write some bits, contained in the low-order @num_bits bits of @bits, to the
 * output bitstream @os.
 *
 * @max_num_bits is a compile-time constant that specifies the maximum number of
 * bits that can ever be written at this call site.
 */
static forceinline void
lzms_write_bits(struct lzms_output_bitstream *os, const u32 bits,
                const unsigned num_bits, const unsigned max_num_bits)
{
        /* Add the bits to the bit buffer variable.  */
        os->bitcount += num_bits;
        os->bitbuf = (os->bitbuf << num_bits) | bits;

        /* Check whether any coding units need to be written.  */
        while (os->bitcount >= 16) {

                os->bitcount -= 16;

                /* Write a coding unit, unless it would underflow the buffer. */
                if (os->next != os->begin) {
                        os->next -= sizeof(le16);
                        put_unaligned_le16(os->bitbuf >> os->bitcount, os->next);
                }

                /* Optimization for call sites that never write more than 16
                 * bits at once.  */
                if (max_num_bits <= 16)
                        break;
        }
}
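
/* Worked example (illustrative, not from the original source): after
 * lzms_write_bits(os, 0x15, 5, 5) followed by lzms_write_bits(os, 0x1fff,
 * 13, 13), @bitbuf holds 18 bits, so the 16 oldest bits (bitbuf >> 2) are
 * written backwards into the buffer as one coding unit and 2 bits remain
 * pending in @bitbuf.  */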

/*
 * Flush the output bitstream, ensuring that all bits written to it have been
 * written to memory.  Return %true if all bits have been output successfully,
 * or %false if an overrun occurred.
 */
static bool
lzms_output_bitstream_flush(struct lzms_output_bitstream *os)
{
        if (os->next == os->begin)
                return false;

        if (os->bitcount != 0) {
                os->next -= sizeof(le16);
                put_unaligned_le16(os->bitbuf << (16 - os->bitcount), os->next);
        }

        return true;
}

static void
lzms_build_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info)
{
        make_canonical_huffman_code(rebuild_info->num_syms,
                                    LZMS_MAX_CODEWORD_LENGTH,
                                    rebuild_info->freqs,
                                    rebuild_info->lens,
                                    rebuild_info->codewords);
        rebuild_info->num_syms_until_rebuild = rebuild_info->rebuild_freq;
}

static void
lzms_init_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info,
                       unsigned num_syms, unsigned rebuild_freq,
                       u32 *codewords, u8 *lens, u32 *freqs)
{
        rebuild_info->num_syms = num_syms;
        rebuild_info->rebuild_freq = rebuild_freq;
        rebuild_info->codewords = codewords;
        rebuild_info->lens = lens;
        rebuild_info->freqs = freqs;
        lzms_init_symbol_frequencies(freqs, num_syms);
        lzms_build_huffman_code(rebuild_info);
}

static void
lzms_rebuild_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info)
{
        lzms_build_huffman_code(rebuild_info);
        lzms_dilute_symbol_frequencies(rebuild_info->freqs, rebuild_info->num_syms);
}

/*
 * Encode a symbol using the specified Huffman code.  Then, if the Huffman code
 * needs to be rebuilt, rebuild it and return true; otherwise return false.
 */
static forceinline bool
lzms_huffman_encode_symbol(unsigned sym,
                           const u32 *codewords, const u8 *lens, u32 *freqs,
                           struct lzms_output_bitstream *os,
                           struct lzms_huffman_rebuild_info *rebuild_info)
{
        lzms_write_bits(os, codewords[sym], lens[sym], LZMS_MAX_CODEWORD_LENGTH);
        ++freqs[sym];
        if (--rebuild_info->num_syms_until_rebuild == 0) {
                lzms_rebuild_huffman_code(rebuild_info);
                return true;
        }
        return false;
}

/* Helper routines to encode symbols using the various Huffman codes  */

static bool
lzms_encode_literal_symbol(struct lzms_compressor *c, unsigned sym)
{
        return lzms_huffman_encode_symbol(sym, c->literal_codewords,
                                          c->literal_lens, c->literal_freqs,
                                          &c->os, &c->literal_rebuild_info);
}

static bool
lzms_encode_lz_offset_symbol(struct lzms_compressor *c, unsigned sym)
{
        return lzms_huffman_encode_symbol(sym, c->lz_offset_codewords,
                                          c->lz_offset_lens, c->lz_offset_freqs,
                                          &c->os, &c->lz_offset_rebuild_info);
}

static bool
lzms_encode_length_symbol(struct lzms_compressor *c, unsigned sym)
{
        return lzms_huffman_encode_symbol(sym, c->length_codewords,
                                          c->length_lens, c->length_freqs,
                                          &c->os, &c->length_rebuild_info);
}

static bool
lzms_encode_delta_offset_symbol(struct lzms_compressor *c, unsigned sym)
{
        return lzms_huffman_encode_symbol(sym, c->delta_offset_codewords,
                                          c->delta_offset_lens, c->delta_offset_freqs,
                                          &c->os, &c->delta_offset_rebuild_info);
}

static bool
lzms_encode_delta_power_symbol(struct lzms_compressor *c, unsigned sym)
{
        return lzms_huffman_encode_symbol(sym, c->delta_power_codewords,
                                          c->delta_power_lens, c->delta_power_freqs,
                                          &c->os, &c->delta_power_rebuild_info);
}

static void
lzms_update_fast_length_costs(struct lzms_compressor *c);

/*
 * Encode a match length.  If this causes the Huffman code for length symbols to
 * be rebuilt, also update the length costs array used by the parser.
 */
static void
lzms_encode_length(struct lzms_compressor *c, u32 length)
{
        unsigned slot = lzms_comp_get_length_slot(c, length);

        if (lzms_encode_length_symbol(c, slot))
                lzms_update_fast_length_costs(c);

        lzms_write_bits(&c->os, length - lzms_length_slot_base[slot],
                        lzms_extra_length_bits[slot],
                        LZMS_MAX_EXTRA_LENGTH_BITS);
}
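
/* Worked example (hypothetical numbers, not taken from the real
 * lzms_length_slot_base[] and lzms_extra_length_bits[] tables): if a slot
 * had base length 9 with 2 extra bits, then a match of length 11 would be
 * encoded as that slot's Huffman symbol followed by the 2-bit value
 * 11 - 9 == 2 written verbatim to the bitstream.  */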

/* Encode the offset of an LZ match.  */
static void
lzms_encode_lz_offset(struct lzms_compressor *c, u32 offset)
{
        unsigned slot = lzms_comp_get_offset_slot(c, offset);

        lzms_encode_lz_offset_symbol(c, slot);
        lzms_write_bits(&c->os, offset - lzms_offset_slot_base[slot],
                        lzms_extra_offset_bits[slot],
                        LZMS_MAX_EXTRA_OFFSET_BITS);
}

/* Encode the raw offset of a delta match.  */
static void
lzms_encode_delta_raw_offset(struct lzms_compressor *c, u32 raw_offset)
{
        unsigned slot = lzms_comp_get_offset_slot(c, raw_offset);

        lzms_encode_delta_offset_symbol(c, slot);
        lzms_write_bits(&c->os, raw_offset - lzms_offset_slot_base[slot],
                        lzms_extra_offset_bits[slot],
                        LZMS_MAX_EXTRA_OFFSET_BITS);
}

/******************************************************************************
 *                             Item encoding                                  *
 ******************************************************************************/

/* Encode the specified item, which may be a literal or any type of match.  */
static void
lzms_encode_item(struct lzms_compressor *c, u32 length, u32 source)
{
        /* Main bit: 0 = literal, 1 = match  */
        int main_bit = (length > 1);
        lzms_encode_main_bit(c, main_bit);

        if (!main_bit) {
                /* Literal  */
                unsigned literal = source;
                lzms_encode_literal_symbol(c, literal);
        } else {
                /* Match  */

                /* Match bit: 0 = LZ match, 1 = delta match  */
                int match_bit = (source & DELTA_SOURCE_TAG) != 0;
                lzms_encode_match_bit(c, match_bit);

                if (!match_bit) {
                        /* LZ match  */

                        /* LZ bit: 0 = explicit offset, 1 = repeat offset  */
                        int lz_bit = (source < LZMS_NUM_LZ_REPS);
                        lzms_encode_lz_bit(c, lz_bit);

                        if (!lz_bit) {
                                /* Explicit offset LZ match  */
                                u32 offset = source - (LZMS_NUM_LZ_REPS - 1);
                                lzms_encode_lz_offset(c, offset);
                        } else {
                                /* Repeat offset LZ match  */
                                int rep_idx = source;
                                for (int i = 0; i < rep_idx; i++)
                                        lzms_encode_lz_rep_bit(c, 1, i);
                                if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS)
                                        lzms_encode_lz_rep_bit(c, 0, rep_idx);
                        }
                } else {
                        /* Delta match  */

                        source &= ~DELTA_SOURCE_TAG;

                        /* Delta bit: 0 = explicit offset, 1 = repeat offset  */
                        int delta_bit = (source < LZMS_NUM_DELTA_REPS);
                        lzms_encode_delta_bit(c, delta_bit);

                        if (!delta_bit) {
                                /* Explicit offset delta match  */
                                u32 power = source >> DELTA_SOURCE_POWER_SHIFT;
                                u32 raw_offset = (source & DELTA_SOURCE_RAW_OFFSET_MASK) -
                                                 (LZMS_NUM_DELTA_REPS - 1);
                                lzms_encode_delta_power_symbol(c, power);
                                lzms_encode_delta_raw_offset(c, raw_offset);
                        } else {
                                /* Repeat offset delta match  */
                                int rep_idx = source;
                                for (int i = 0; i < rep_idx; i++)
                                        lzms_encode_delta_rep_bit(c, 1, i);
                                if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS)
                                        lzms_encode_delta_rep_bit(c, 0, rep_idx);
                        }
                }

                /* Match length (encoded the same way for any match type)  */
                lzms_encode_length(c, length);
        }
}

/* Encode a list of matches and literals chosen by the parsing algorithm.  */
static void
lzms_encode_nonempty_item_list(struct lzms_compressor *c,
                               struct lzms_optimum_node *end_node)
{
        /* Since we've stored at each node the item we took to arrive at that
         * node, we can trace our chosen path in backwards order.  However, for
         * encoding we need to trace our chosen path in forwards order.  To make
         * this possible, the following loop moves the items from their
         * destination nodes to their source nodes, which effectively reverses
         * the path.  (Think of it like reversing a singly-linked list.)  */
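        /* Illustrative example (not from the original source): if the chosen
         * path is node0 --(item A, length 3)--> node3 --(item B, length 5)-->
         * node8, then A is stored at node3 and B at node8.  The loop below
         * moves B to node3 and A to node0, so that the second loop can walk
         * forwards from node0, encoding A and then B.  */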
        struct lzms_optimum_node *cur_node = end_node;
        struct lzms_item saved_item = cur_node->item;
        do {
                struct lzms_item item = saved_item;
                if (cur_node->num_extra_items > 0) {
                        /* Handle an arrival via multi-item lookahead.  */
                        unsigned i = 0;
                        struct lzms_optimum_node *orig_node = cur_node;
                        do {
                                cur_node -= item.length;
                                cur_node->item = item;
                                item = orig_node->extra_items[i];
                        } while (++i != orig_node->num_extra_items);
                }
                cur_node -= item.length;
                saved_item = cur_node->item;
                cur_node->item = item;
        } while (cur_node != c->optimum_nodes);

        /* Now trace the chosen path in forwards order, encoding each item.  */
        do {
                lzms_encode_item(c, cur_node->item.length, cur_node->item.source);
                cur_node += cur_node->item.length;
        } while (cur_node != end_node);
}

static forceinline void
lzms_encode_item_list(struct lzms_compressor *c,
                      struct lzms_optimum_node *end_node)
{
        if (end_node != c->optimum_nodes)
                lzms_encode_nonempty_item_list(c, end_node);
}

/******************************************************************************
 *                             Cost evaluation                                *
 ******************************************************************************/

/*
 * If p is the predicted probability of the next bit being a 0, then the number
 * of bits required to encode a 0 bit using a binary range encoder is the real
 * number -log2(p), and the number of bits required to encode a 1 bit is the
 * real number -log2(1 - p).  To avoid computing either of these expressions at
 * runtime, 'lzms_bit_costs' is a precomputed table that stores a mapping from
 * probability to cost for each possible probability.  Specifically, the array
 * indices are the numerators of the possible probabilities in LZMS, where the
 * denominators are LZMS_PROBABILITY_DENOMINATOR; and the stored costs are the
 * bit costs multiplied by 1<<COST_SHIFT and rounded to the nearest integer.
 * Furthermore, the values stored for 0% and 100% probabilities are equal to the
 * adjacent values, since these probabilities are not actually permitted.  This
 * allows us to use the num_recent_zero_bits value from the
 * lzms_probability_entry as the array index without fixing up these two special
 * cases.
 */
static const u32 lzms_bit_costs[LZMS_PROBABILITY_DENOMINATOR + 1] = {
        384, 384, 320, 283, 256, 235, 219, 204,
        192, 181, 171, 163, 155, 147, 140, 134,
        128, 122, 117, 112, 107, 103, 99,  94,
        91,  87,  83,  80,  76,  73,  70,  67,
        64,  61,  58,  56,  53,  51,  48,  46,
        43,  41,  39,  37,  35,  33,  30,  29,
        27,  25,  23,  21,  19,  17,  16,  14,
        12,  11,  9,   8,   6,   4,   3,   1,
        1
};

static _unused_attribute void
check_cost_shift(void)
{
        /* lzms_bit_costs is hard-coded to the current COST_SHIFT.  */
        STATIC_ASSERT(COST_SHIFT == 6);
}

#if 0
#include <math.h>

static void
lzms_compute_bit_costs(void)
{
        for (u32 i = 0; i <= LZMS_PROBABILITY_DENOMINATOR; i++) {
                u32 prob = i;
                if (prob == 0)
                        prob++;
                else if (prob == LZMS_PROBABILITY_DENOMINATOR)
                        prob--;

                lzms_bit_costs[i] = round(-log2((double)prob / LZMS_PROBABILITY_DENOMINATOR) *
                                          (1 << COST_SHIFT));
        }
}
#endif

/* Return the cost to encode a 0 bit in the specified context.  */
static forceinline u32
lzms_bit_0_cost(unsigned state, const struct lzms_probability_entry *probs)
{
        return lzms_bit_costs[probs[state].num_recent_zero_bits];
}

/* Return the cost to encode a 1 bit in the specified context.  */
static forceinline u32
lzms_bit_1_cost(unsigned state, const struct lzms_probability_entry *probs)
{
        return lzms_bit_costs[LZMS_PROBABILITY_DENOMINATOR -
                              probs[state].num_recent_zero_bits];
}

/* Return the cost to encode a literal, including the main bit.  */
static forceinline u32
lzms_literal_cost(struct lzms_compressor *c, unsigned main_state, unsigned literal)
{
        return lzms_bit_0_cost(main_state, c->probs.main) +
                ((u32)c->literal_lens[literal] << COST_SHIFT);
}

/* Update 'fast_length_cost_tab' to use the latest Huffman code.  */
static void
lzms_update_fast_length_costs(struct lzms_compressor *c)
{
        int slot = -1;
        u32 cost = 0;
        for (u32 len = LZMS_MIN_MATCH_LENGTH; len <= MAX_FAST_LENGTH; len++) {
                if (len >= lzms_length_slot_base[slot + 1]) {
                        slot++;
                        cost = (u32)(c->length_lens[slot] +
                                     lzms_extra_length_bits[slot]) << COST_SHIFT;
                }
                c->fast_length_cost_tab[len] = cost;
        }
}

/* Return the cost to encode the specified match length, which must not exceed
 * MAX_FAST_LENGTH.  */
static forceinline u32
lzms_fast_length_cost(const struct lzms_compressor *c, u32 length)
{
        return c->fast_length_cost_tab[length];
}

/* Return the cost to encode the specified LZ match offset.  */
static forceinline u32
lzms_lz_offset_cost(const struct lzms_compressor *c, u32 offset)
{
        unsigned slot = lzms_comp_get_offset_slot(c, offset);
        u32 num_bits = c->lz_offset_lens[slot] + lzms_extra_offset_bits[slot];
        return num_bits << COST_SHIFT;
}

/* Return the cost to encode the specified delta power and raw offset.  */
static forceinline u32
lzms_delta_source_cost(const struct lzms_compressor *c, u32 power, u32 raw_offset)
{
        unsigned slot = lzms_comp_get_offset_slot(c, raw_offset);
        u32 num_bits = c->delta_power_lens[power] + c->delta_offset_lens[slot] +
                       lzms_extra_offset_bits[slot];
        return num_bits << COST_SHIFT;
}

/******************************************************************************
 *                              Adaptive state                                *
 ******************************************************************************/

static void
lzms_init_adaptive_state(struct lzms_adaptive_state *state)
{
        for (int i = 0; i < LZMS_NUM_LZ_REPS + 1; i++)
                state->recent_lz_offsets[i] = i + 1;
        state->prev_lz_offset = 0;
        state->upcoming_lz_offset = 0;

        for (int i = 0; i < LZMS_NUM_DELTA_REPS + 1; i++)
                state->recent_delta_pairs[i] = i + 1;
        state->prev_delta_pair = 0;
        state->upcoming_delta_pair = 0;

        state->main_state = 0;
        state->match_state = 0;
        state->lz_state = 0;
        for (int i = 0; i < LZMS_NUM_LZ_REP_DECISIONS; i++)
                state->lz_rep_states[i] = 0;
        state->delta_state = 0;
        for (int i = 0; i < LZMS_NUM_DELTA_REP_DECISIONS; i++)
                state->delta_rep_states[i] = 0;
}

/*
 * Update the LRU queues for match sources when advancing by one item.
 *
 * Note: using LZMA as a point of comparison, the LRU queues in LZMS are more
 * complex because:
 *      - there are separate queues for LZ and delta matches
 *      - updates to the queues are delayed by one encoded item (this prevents
 *        sources from being bumped up to index 0 too early)
 */
static void
lzms_update_lru_queues(struct lzms_adaptive_state *state)
{
        if (state->prev_lz_offset != 0) {
                for (int i = LZMS_NUM_LZ_REPS - 1; i >= 0; i--)
                        state->recent_lz_offsets[i + 1] = state->recent_lz_offsets[i];
                state->recent_lz_offsets[0] = state->prev_lz_offset;
        }
        state->prev_lz_offset = state->upcoming_lz_offset;

        if (state->prev_delta_pair != 0) {
                for (int i = LZMS_NUM_DELTA_REPS - 1; i >= 0; i--)
                        state->recent_delta_pairs[i + 1] = state->recent_delta_pairs[i];
                state->recent_delta_pairs[0] = state->prev_delta_pair;
        }
        state->prev_delta_pair = state->upcoming_delta_pair;
}
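
/* Worked example (illustrative, not from the original source) of the delayed
 * update above: when an item using LZ offset 100 is chosen, the parser sets
 * upcoming_lz_offset = 100.  The update for that item only copies it to
 * prev_lz_offset; the update for the *following* item then pushes 100 onto
 * the front of recent_lz_offsets.  The offset therefore becomes a repeat
 * offset candidate one item later than it would with an immediate update.  */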

static forceinline void
lzms_update_state(u8 *state_p, int bit, unsigned num_states)
{
        *state_p = ((*state_p << 1) | bit) & (num_states - 1);
}
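
/* Worked example (illustrative): each state is a sliding window of the most
 * recent decision bits.  With num_states == 16, a state of 0b1010 followed
 * by the bit 1 becomes ((0b1010 << 1) | 1) & 15 == 0b0101.  */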

static forceinline void
lzms_update_main_state(struct lzms_adaptive_state *state, int is_match)
{
        lzms_update_state(&state->main_state, is_match, LZMS_NUM_MAIN_PROBS);
}

static forceinline void
lzms_update_match_state(struct lzms_adaptive_state *state, int is_delta)
{
        lzms_update_state(&state->match_state, is_delta, LZMS_NUM_MATCH_PROBS);
}

static forceinline void
lzms_update_lz_state(struct lzms_adaptive_state *state, int is_rep)
{
        lzms_update_state(&state->lz_state, is_rep, LZMS_NUM_LZ_PROBS);
}

static forceinline void
lzms_update_lz_rep_states(struct lzms_adaptive_state *state, int rep_idx)
{
        for (int i = 0; i < rep_idx; i++)
                lzms_update_state(&state->lz_rep_states[i], 1, LZMS_NUM_LZ_REP_PROBS);

        if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS)
                lzms_update_state(&state->lz_rep_states[rep_idx], 0, LZMS_NUM_LZ_REP_PROBS);
}

static forceinline void
lzms_update_delta_state(struct lzms_adaptive_state *state, int is_rep)
{
        lzms_update_state(&state->delta_state, is_rep, LZMS_NUM_DELTA_PROBS);
}

static forceinline void
lzms_update_delta_rep_states(struct lzms_adaptive_state *state, int rep_idx)
{
        for (int i = 0; i < rep_idx; i++)
                lzms_update_state(&state->delta_rep_states[i], 1, LZMS_NUM_DELTA_REP_PROBS);

        if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS)
                lzms_update_state(&state->delta_rep_states[rep_idx], 0, LZMS_NUM_DELTA_REP_PROBS);
}

/******************************************************************************
 *                              Matchfinding                                  *
 ******************************************************************************/

/* Note: this code just handles finding delta matches.  The code for finding LZ
 * matches is elsewhere.  */


/* Initialize the delta matchfinder for a new input buffer.  */
static void
lzms_init_delta_matchfinder(struct lzms_compressor *c)
{
        /* Set all entries to use an invalid power, which will never match.  */
        STATIC_ASSERT(NUM_POWERS_TO_CONSIDER < (1 << (32 - DELTA_SOURCE_POWER_SHIFT)));
        memset(c->delta_hash_table, 0xFF, sizeof(c->delta_hash_table));

        /* Initialize the next hash code for each power.  We can just use zeroes
         * initially; it doesn't really matter.  */
        for (u32 i = 0; i < NUM_POWERS_TO_CONSIDER; i++)
                c->next_delta_hashes[i] = 0;
}

/*
 * Compute a DELTA_HASH_ORDER-bit hash code for the first
 * NBYTES_HASHED_FOR_DELTA bytes of the sequence beginning at @p when taken in a
 * delta context with the specified @span.
 */
static forceinline u32
lzms_delta_hash(const u8 *p, const u32 pos, u32 span)
{
        /* A delta match has a certain span and an offset that is a multiple of
         * that span.  To reduce wasted space we use a single combined hash
         * table for all spans and positions, but to minimize collisions we
         * include in the hash code computation the span and the low-order bits
         * of the current position.  */

        STATIC_ASSERT(NBYTES_HASHED_FOR_DELTA == 3);
        u8 d0 = *(p + 0) - *(p + 0 - span);
        u8 d1 = *(p + 1) - *(p + 1 - span);
        u8 d2 = *(p + 2) - *(p + 2 - span);
        u32 v = ((span + (pos & (span - 1))) << 24) |
                ((u32)d2 << 16) | ((u32)d1 << 8) | d0;
        return lz_hash(v, DELTA_HASH_ORDER);
}
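
/* Illustrative example (not from the original source): an array of
 * little-endian 16-bit integers 0, 1, 2, 3, ... never repeats as raw bytes,
 * but with span == 2 each byte differs from the byte two positions earlier
 * by a constant (1 for low bytes, 0 for high bytes, ignoring carries), so
 * the stream of differences is highly repetitive and long delta matches can
 * be found in it.  */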

/*
 * Given a match between @in_next and @matchptr in a delta context with the
 * specified @span and having the initial @len, extend the match as far as
 * possible, up to a limit of @max_len.
 */
static forceinline u32
lzms_extend_delta_match(const u8 *in_next, const u8 *matchptr,
                        u32 len, u32 max_len, u32 span)
{
        while (len < max_len &&
               (u8)(*(in_next + len) - *(in_next + len - span)) ==
               (u8)(*(matchptr + len) - *(matchptr + len - span)))
        {
                len++;
        }
        return len;
}

static void
lzms_delta_matchfinder_skip_bytes(struct lzms_compressor *c,
                                  const u8 *in_next, u32 count)
{
        u32 pos = in_next - c->in_buffer;
        if (unlikely(c->in_nbytes - (pos + count) <= NBYTES_HASHED_FOR_DELTA + 1))
                return;
        do {
                /* Update the hash table for each power.  */
                for (u32 power = 0; power < NUM_POWERS_TO_CONSIDER; power++) {
                        const u32 span = (u32)1 << power;
                        if (unlikely(pos < span))
                                continue;
                        const u32 next_hash = lzms_delta_hash(in_next + 1, pos + 1, span);
                        const u32 hash = c->next_delta_hashes[power];
                        c->delta_hash_table[hash] =
                                (power << DELTA_SOURCE_POWER_SHIFT) | pos;
                        c->next_delta_hashes[power] = next_hash;
                        prefetchw(&c->delta_hash_table[next_hash]);
                }
        } while (in_next++, pos++, --count);
}

/*
 * Skip the next @count bytes (don't search for matches at them).  @in_next
 * points to the first byte to skip.  The return value is @in_next + count.
 */
static const u8 *
lzms_skip_bytes(struct lzms_compressor *c, u32 count, const u8 *in_next)
{
        lcpit_matchfinder_skip_bytes(&c->mf, count);
        if (c->use_delta_matches)
                lzms_delta_matchfinder_skip_bytes(c, in_next, count);
        return in_next + count;
}

/******************************************************************************
 *                          "Near-optimal" parsing                            *
 ******************************************************************************/

/*
 * The main near-optimal parsing routine.
 *
 * Briefly, the algorithm does an approximate minimum-cost path search to find a
 * "near-optimal" sequence of matches and literals to output, based on the
 * current cost model.  The algorithm steps forward, position by position (byte
 * by byte), and updates the minimum cost path to reach each later position that
 * can be reached using a match or literal from the current position.  This is
 * essentially Dijkstra's algorithm in disguise: the graph nodes are positions,
 * the graph edges are possible matches/literals to code, and the cost of each
 * edge is the estimated number of bits (scaled up by COST_SHIFT) that will be
 * required to output the corresponding match or literal.  But one difference is
 * that we actually compute the lowest-cost path in pieces, where each piece is
 * terminated when there are no choices to be made.
 *
 * The costs of literals and matches are estimated using the range encoder
 * states and the semi-adaptive Huffman codes.  Except for range encoding
 * states, costs are assumed to be constant throughout a single run of the
 * parsing algorithm, which can parse up to NUM_OPTIM_NODES bytes of data.  This
 * introduces a source of non-optimality because the probabilities and Huffman
 * codes can change over this part of the data.  And of course, there are
 * various other reasons why the result isn't optimal in terms of compression
 * ratio.
 */
static void
lzms_near_optimal_parse(struct lzms_compressor *c)
{
        const u8 *in_next = c->in_buffer;
        const u8 * const in_end = &c->in_buffer[c->in_nbytes];
        struct lzms_optimum_node *cur_node;
        struct lzms_optimum_node *end_node;

        /* Set initial length costs for lengths <= MAX_FAST_LENGTH.  */
        lzms_update_fast_length_costs(c);

        /* Set up the initial adaptive state.  */
        lzms_init_adaptive_state(&c->optimum_nodes[0].state);

begin:
        /* Start building a new list of items, which will correspond to the next
         * piece of the overall minimum-cost path.  */

        cur_node = c->optimum_nodes;
        cur_node->cost = 0;
        end_node = cur_node;

        if (in_next == in_end)
                return;

        /* The following loop runs once per byte in the input buffer, except
         * in a few shortcut cases.  */
1329         for (;;) {
1330                 u32 num_matches;
1331
1332                 /* Repeat offset LZ matches  */
1333                 if (likely(in_next - c->in_buffer >= LZMS_NUM_LZ_REPS &&
1334                            in_end - in_next >= 2))
1335                 {
1336                         for (int rep_idx = 0; rep_idx < LZMS_NUM_LZ_REPS; rep_idx++) {
1337
1338                                 /* Looking for a repeat offset LZ match at queue
1339                                  * index @rep_idx  */
1340
1341                                 const u32 offset = cur_node->state.recent_lz_offsets[rep_idx];
1342                                 const u8 * const matchptr = in_next - offset;
1343
1344                                 /* Check the first 2 bytes before entering the extension loop.  */
1345                                 if (load_u16_unaligned(in_next) != load_u16_unaligned(matchptr))
1346                                         continue;
1347
1348                                 /* Extend the match to its full length.  */
1349                                 const u32 rep_len = lz_extend(in_next, matchptr, 2, in_end - in_next);
1350
1351                                 /* Early out for long repeat offset LZ match */
1352                                 if (rep_len >= c->mf.nice_match_len) {
1353
1354                                         in_next = lzms_skip_bytes(c, rep_len, in_next);
1355
1356                                         lzms_encode_item_list(c, cur_node);
1357                                         lzms_encode_item(c, rep_len, rep_idx);
1358
1359                                         c->optimum_nodes[0].state = cur_node->state;
1360                                         cur_node = &c->optimum_nodes[0];
1361
1362                                         cur_node->state.upcoming_lz_offset =
1363                                                 cur_node->state.recent_lz_offsets[rep_idx];
1364                                         cur_node->state.upcoming_delta_pair = 0;
1365                                         for (int i = rep_idx; i < LZMS_NUM_LZ_REPS; i++)
1366                                                 cur_node->state.recent_lz_offsets[i] =
1367                                                         cur_node->state.recent_lz_offsets[i + 1];
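					/* (Reading index i + 1 on the final
					 * iteration is safe: the recent-offsets
					 * queues are declared with one extra
					 * slot beyond LZMS_NUM_LZ_REPS.)  */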
1368                                         lzms_update_lru_queues(&cur_node->state);
1369                                         lzms_update_main_state(&cur_node->state, 1);
1370                                         lzms_update_match_state(&cur_node->state, 0);
1371                                         lzms_update_lz_state(&cur_node->state, 1);
1372                                         lzms_update_lz_rep_states(&cur_node->state, rep_idx);
1373                                         goto begin;
1374                                 }
1375
1376                                 while (end_node < cur_node + rep_len)
1377                                         (++end_node)->cost = INFINITE_COST;
1378
1379                                 u32 base_cost = cur_node->cost +
1380                                                 lzms_bit_1_cost(cur_node->state.main_state,
1381                                                                 c->probs.main) +
1382                                                 lzms_bit_0_cost(cur_node->state.match_state,
1383                                                                 c->probs.match) +
1384                                                 lzms_bit_1_cost(cur_node->state.lz_state,
1385                                                                 c->probs.lz);
1386
1387                                 for (int i = 0; i < rep_idx; i++)
1388                                         base_cost += lzms_bit_1_cost(cur_node->state.lz_rep_states[i],
1389                                                                      c->probs.lz_rep[i]);
1390
1391                                 if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS)
1392                                         base_cost += lzms_bit_0_cost(cur_node->state.lz_rep_states[rep_idx],
1393                                                                      c->probs.lz_rep[rep_idx]);
1394
1395                                 u32 len = 2;
1396                                 do {
1397                                         u32 cost = base_cost + lzms_fast_length_cost(c, len);
1398                                         if (cost < (cur_node + len)->cost) {
1399                                                 (cur_node + len)->cost = cost;
1400                                                 (cur_node + len)->item = (struct lzms_item) {
1401                                                         .length = len,
1402                                                         .source = rep_idx,
1403                                                 };
1404                                                 (cur_node + len)->num_extra_items = 0;
1405                                         }
1406                                 } while (++len <= rep_len);
1407
1408
1409                                 /* try LZ-rep + lit + LZ-rep0  */
1410                                 if (c->try_lzrep_lit_lzrep0 &&
1411                                     in_end - (in_next + rep_len) >= 3 &&
1412                                     load_u16_unaligned(in_next + rep_len + 1) ==
1413                                     load_u16_unaligned(matchptr + rep_len + 1))
1414                                 {
1415                                         const u32 rep0_len = lz_extend(in_next + rep_len + 1,
1416                                                                        matchptr + rep_len + 1,
1417                                                                        2,
1418                                                                        min(c->mf.nice_match_len,
1419                                                                            in_end - (in_next + rep_len + 1)));
1420
1421                                         unsigned main_state = cur_node->state.main_state;
1422                                         unsigned match_state = cur_node->state.match_state;
1423                                         unsigned lz_state = cur_node->state.lz_state;
1424                                         unsigned lz_rep0_state = cur_node->state.lz_rep_states[0];
1425
1426                                         /* update states for LZ-rep  */
1427                                         main_state = ((main_state << 1) | 1) % LZMS_NUM_MAIN_PROBS;
1428                                         match_state = ((match_state << 1) | 0) % LZMS_NUM_MATCH_PROBS;
1429                                         lz_state = ((lz_state << 1) | 1) % LZMS_NUM_LZ_PROBS;
1430                                         lz_rep0_state = ((lz_rep0_state << 1) | (rep_idx > 0)) %
1431                                                                 LZMS_NUM_LZ_REP_PROBS;
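					/* (These shifts mirror what the
					 * lzms_update_*_state() helpers do, but
					 * on local copies: we are only costing
					 * this path, not taking it.)  */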
1432
1433                                         /* LZ-rep cost  */
1434                                         u32 cost = base_cost + lzms_fast_length_cost(c, rep_len);
1435
1436                                         /* add literal cost  */
1437                                         cost += lzms_literal_cost(c, main_state, *(in_next + rep_len));
1438
1439                                         /* update state for literal  */
1440                                         main_state = ((main_state << 1) | 0) % LZMS_NUM_MAIN_PROBS;
1441
1442                                         /* add LZ-rep0 cost  */
1443                                         cost += lzms_bit_1_cost(main_state, c->probs.main) +
1444                                                 lzms_bit_0_cost(match_state, c->probs.match) +
1445                                                 lzms_bit_1_cost(lz_state, c->probs.lz) +
1446                                                 lzms_bit_0_cost(lz_rep0_state, c->probs.lz_rep[0]) +
1447                                                 lzms_fast_length_cost(c, rep0_len);
1448
1449                                         const u32 total_len = rep_len + 1 + rep0_len;
1450
1451                                         while (end_node < cur_node + total_len)
1452                                                 (++end_node)->cost = INFINITE_COST;
1453
1454                                         if (cost < (cur_node + total_len)->cost) {
1455                                                 (cur_node + total_len)->cost = cost;
1456                                                 (cur_node + total_len)->item = (struct lzms_item) {
1457                                                         .length = rep0_len,
1458                                                         .source = 0,
1459                                                 };
1460                                                 (cur_node + total_len)->extra_items[0] = (struct lzms_item) {
1461                                                         .length = 1,
1462                                                         .source = *(in_next + rep_len),
1463                                                 };
1464                                                 (cur_node + total_len)->extra_items[1] = (struct lzms_item) {
1465                                                         .length = rep_len,
1466                                                         .source = rep_idx,
1467                                                 };
1468                                                 (cur_node + total_len)->num_extra_items = 2;
1469                                         }
1470                                 }
1471                         }
1472                 }
1473
1474                 /* Repeat offset delta matches  */
1475                 if (c->use_delta_matches &&
1476                     likely(in_next - c->in_buffer >= LZMS_NUM_DELTA_REPS + 1 &&
1477                            (in_end - in_next >= 2)))
1478                 {
1479                         for (int rep_idx = 0; rep_idx < LZMS_NUM_DELTA_REPS; rep_idx++) {
1480
1481                                 /* Looking for a repeat offset delta match at
1482                                  * queue index @rep_idx  */
1483
1484                                 const u32 pair = cur_node->state.recent_delta_pairs[rep_idx];
1485                                 const u32 power = pair >> DELTA_SOURCE_POWER_SHIFT;
1486                                 const u32 raw_offset = pair & DELTA_SOURCE_RAW_OFFSET_MASK;
1487                                 const u32 span = (u32)1 << power;
1488                                 const u32 offset = raw_offset << power;
1489                                 const u8 * const matchptr = in_next - offset;
1490
1491                                 /* Check the first 2 bytes before entering the
1492                                  * extension loop.  */
1493                                 if (((u8)(*(in_next + 0) - *(in_next + 0 - span)) !=
1494                                      (u8)(*(matchptr + 0) - *(matchptr + 0 - span))) ||
1495                                     ((u8)(*(in_next + 1) - *(in_next + 1 - span)) !=
1496                                      (u8)(*(matchptr + 1) - *(matchptr + 1 - span))))
1497                                         continue;
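				/* (Two regions "delta match" when their byte
				 * differences at distance 'span' agree; e.g.
				 * with span = 1, "abcde" and "bcdef"
				 * delta-match even though no raw bytes do.)  */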
1498
1499                                 /* Extend the match to its full length.  */
1500                                 const u32 rep_len = lzms_extend_delta_match(in_next, matchptr,
1501                                                                             2, in_end - in_next,
1502                                                                             span);
1503
1504                                 /* Early out for long repeat offset delta match */
1505                                 if (rep_len >= c->mf.nice_match_len) {
1506
1507                                         in_next = lzms_skip_bytes(c, rep_len, in_next);
1508
1509                                         lzms_encode_item_list(c, cur_node);
1510                                         lzms_encode_item(c, rep_len, DELTA_SOURCE_TAG | rep_idx);
1511
1512                                         c->optimum_nodes[0].state = cur_node->state;
1513                                         cur_node = &c->optimum_nodes[0];
1514
1515                                         cur_node->state.upcoming_delta_pair = pair;
1516                                         cur_node->state.upcoming_lz_offset = 0;
1517                                         for (int i = rep_idx; i < LZMS_NUM_DELTA_REPS; i++)
1518                                                 cur_node->state.recent_delta_pairs[i] =
1519                                                         cur_node->state.recent_delta_pairs[i + 1];
1520                                         lzms_update_lru_queues(&cur_node->state);
1521                                         lzms_update_main_state(&cur_node->state, 1);
1522                                         lzms_update_match_state(&cur_node->state, 1);
1523                                         lzms_update_delta_state(&cur_node->state, 1);
1524                                         lzms_update_delta_rep_states(&cur_node->state, rep_idx);
1525                                         goto begin;
1526                                 }
1527
1528                                 while (end_node < cur_node + rep_len)
1529                                         (++end_node)->cost = INFINITE_COST;
1530
1531                                 u32 base_cost = cur_node->cost +
1532                                                 lzms_bit_1_cost(cur_node->state.main_state,
1533                                                                 c->probs.main) +
1534                                                 lzms_bit_1_cost(cur_node->state.match_state,
1535                                                                 c->probs.match) +
1536                                                 lzms_bit_1_cost(cur_node->state.delta_state,
1537                                                                 c->probs.delta);
1538
1539                                 for (int i = 0; i < rep_idx; i++)
1540                                         base_cost += lzms_bit_1_cost(cur_node->state.delta_rep_states[i],
1541                                                                      c->probs.delta_rep[i]);
1542
1543                                 if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS)
1544                                         base_cost += lzms_bit_0_cost(cur_node->state.delta_rep_states[rep_idx],
1545                                                                      c->probs.delta_rep[rep_idx]);
1546
1547                                 u32 len = 2;
1548                                 do {
1549                                         u32 cost = base_cost + lzms_fast_length_cost(c, len);
1550                                         if (cost < (cur_node + len)->cost) {
1551                                                 (cur_node + len)->cost = cost;
1552                                                 (cur_node + len)->item = (struct lzms_item) {
1553                                                         .length = len,
1554                                                         .source = DELTA_SOURCE_TAG | rep_idx,
1555                                                 };
1556                                                 (cur_node + len)->num_extra_items = 0;
1557                                         }
1558                                 } while (++len <= rep_len);
1559                         }
1560                 }
1561
1562                 /* Explicit offset LZ matches  */
1563                 num_matches = lcpit_matchfinder_get_matches(&c->mf, c->matches);
1564                 if (num_matches) {
1565
1566                         u32 best_len = c->matches[0].length;
1567
1568                         /* Early out for long explicit offset LZ match  */
1569                         if (best_len >= c->mf.nice_match_len) {
1570
1571                                 const u32 offset = c->matches[0].offset;
1572
1573                                 /* Extend the match as far as possible.
1574                                  * This is necessary because the LCP-interval
1575                                  * tree matchfinder only reports up to
1576                                  * nice_match_len bytes.  */
1577                                 best_len = lz_extend(in_next, in_next - offset,
1578                                                      best_len, in_end - in_next);
1579
1580                                 in_next = lzms_skip_bytes(c, best_len - 1, in_next + 1);
1581
1582                                 lzms_encode_item_list(c, cur_node);
1583                                 lzms_encode_item(c, best_len, offset + LZMS_NUM_LZ_REPS - 1);
1584
1585                                 c->optimum_nodes[0].state = cur_node->state;
1586                                 cur_node = &c->optimum_nodes[0];
1587
1588                                 cur_node->state.upcoming_lz_offset = offset;
1589                                 cur_node->state.upcoming_delta_pair = 0;
1590                                 lzms_update_lru_queues(&cur_node->state);
1591                                 lzms_update_main_state(&cur_node->state, 1);
1592                                 lzms_update_match_state(&cur_node->state, 0);
1593                                 lzms_update_lz_state(&cur_node->state, 0);
1594                                 goto begin;
1595                         }
1596
1597                         while (end_node < cur_node + best_len)
1598                                 (++end_node)->cost = INFINITE_COST;
1599
1600                         u32 base_cost = cur_node->cost +
1601                                         lzms_bit_1_cost(cur_node->state.main_state,
1602                                                         c->probs.main) +
1603                                         lzms_bit_0_cost(cur_node->state.match_state,
1604                                                         c->probs.match) +
1605                                         lzms_bit_0_cost(cur_node->state.lz_state,
1606                                                         c->probs.lz);
1607
1608                         if (c->try_lzmatch_lit_lzrep0 &&
1609                             likely(in_end - (in_next + c->matches[0].length) >= 3))
1610                         {
1611                                 /* try LZ-match + lit + LZ-rep0  */
1612
1613                                 u32 l = 2;
1614                                 u32 i = num_matches - 1;
1615                                 do {
1616                                         const u32 len = c->matches[i].length;
1617                                         const u32 offset = c->matches[i].offset;
1618                                         const u32 position_cost = base_cost +
1619                                                                   lzms_lz_offset_cost(c, offset);
1620                                         do {
1621                                                 u32 cost = position_cost + lzms_fast_length_cost(c, l);
1622                                                 if (cost < (cur_node + l)->cost) {
1623                                                         (cur_node + l)->cost = cost;
1624                                                         (cur_node + l)->item = (struct lzms_item) {
1625                                                                 .length = l,
1626                                                                 .source = offset + (LZMS_NUM_LZ_REPS - 1),
1627                                                         };
1628                                                         (cur_node + l)->num_extra_items = 0;
1629                                                 }
1630                                         } while (++l <= len);
1631
1632                                         const u8 * const matchptr = in_next - offset;
1633                                         if (load_u16_unaligned(matchptr + len + 1) !=
1634                                             load_u16_unaligned(in_next + len + 1))
1635                                                 continue;
1636
1637                                         const u32 rep0_len = lz_extend(in_next + len + 1,
1638                                                                        matchptr + len + 1,
1639                                                                        2,
1640                                                                        min(c->mf.nice_match_len,
1641                                                                            in_end - (in_next + len + 1)));
1642
1643                                         unsigned main_state = cur_node->state.main_state;
1644                                         unsigned match_state = cur_node->state.match_state;
1645                                         unsigned lz_state = cur_node->state.lz_state;
1646
1647                                         /* update states for LZ-match  */
1648                                         main_state = ((main_state << 1) | 1) % LZMS_NUM_MAIN_PROBS;
1649                                         match_state = ((match_state << 1) | 0) % LZMS_NUM_MATCH_PROBS;
1650                                         lz_state = ((lz_state << 1) | 0) % LZMS_NUM_LZ_PROBS;
1651
1652                                         /* LZ-match cost  */
1653                                         u32 cost = position_cost + lzms_fast_length_cost(c, len);
1654
1655                                         /* add literal cost  */
1656                                         cost += lzms_literal_cost(c, main_state, *(in_next + len));
1657
1658                                         /* update state for literal  */
1659                                         main_state = ((main_state << 1) | 0) % LZMS_NUM_MAIN_PROBS;
1660
1661                                         /* add LZ-rep0 cost  */
1662                                         cost += lzms_bit_1_cost(main_state, c->probs.main) +
1663                                                 lzms_bit_0_cost(match_state, c->probs.match) +
1664                                                 lzms_bit_1_cost(lz_state, c->probs.lz) +
1665                                                 lzms_bit_0_cost(cur_node->state.lz_rep_states[0],
1666                                                                 c->probs.lz_rep[0]) +
1667                                                 lzms_fast_length_cost(c, rep0_len);
1668
1669                                         const u32 total_len = len + 1 + rep0_len;
1670
1671                                         while (end_node < cur_node + total_len)
1672                                                 (++end_node)->cost = INFINITE_COST;
1673
1674                                         if (cost < (cur_node + total_len)->cost) {
1675                                                 (cur_node + total_len)->cost = cost;
1676                                                 (cur_node + total_len)->item = (struct lzms_item) {
1677                                                         .length = rep0_len,
1678                                                         .source = 0,
1679                                                 };
1680                                                 (cur_node + total_len)->extra_items[0] = (struct lzms_item) {
1681                                                         .length = 1,
1682                                                         .source = *(in_next + len),
1683                                                 };
1684                                                 (cur_node + total_len)->extra_items[1] = (struct lzms_item) {
1685                                                         .length = len,
1686                                                         .source = offset + LZMS_NUM_LZ_REPS - 1,
1687                                                 };
1688                                                 (cur_node + total_len)->num_extra_items = 2;
1689                                         }
1690                                 } while (i--);
1691                         } else {
1692                                 u32 l = 2;
1693                                 u32 i = num_matches - 1;
1694                                 do {
1695                                         u32 position_cost = base_cost +
1696                                                             lzms_lz_offset_cost(c, c->matches[i].offset);
1697                                         do {
1698                                                 u32 cost = position_cost + lzms_fast_length_cost(c, l);
1699                                                 if (cost < (cur_node + l)->cost) {
1700                                                         (cur_node + l)->cost = cost;
1701                                                         (cur_node + l)->item = (struct lzms_item) {
1702                                                                 .length = l,
1703                                                                 .source = c->matches[i].offset +
1704                                                                           (LZMS_NUM_LZ_REPS - 1),
1705                                                         };
1706                                                         (cur_node + l)->num_extra_items = 0;
1707                                                 }
1708                                         } while (++l <= c->matches[i].length);
1709                                 } while (i--);
1710                         }
1711                 }
1712
1713                 /* Explicit offset delta matches  */
1714                 if (c->use_delta_matches &&
1715                     likely(in_end - in_next >= NBYTES_HASHED_FOR_DELTA + 1))
1716                 {
1717                         const u32 pos = in_next - c->in_buffer;
1718
1719                         /* Consider each possible power (log2 of span)  */
1720                         STATIC_ASSERT(NUM_POWERS_TO_CONSIDER <= LZMS_NUM_DELTA_POWER_SYMS);
1721                         for (u32 power = 0; power < NUM_POWERS_TO_CONSIDER; power++) {
1722
1723                                 const u32 span = (u32)1 << power;
1724
1725                                 if (unlikely(pos < span))
1726                                         continue;
1727
1728                                 const u32 next_hash = lzms_delta_hash(in_next + 1, pos + 1, span);
1729                                 const u32 hash = c->next_delta_hashes[power];
1730                                 const u32 cur_match = c->delta_hash_table[hash];
1731
1732                                 c->delta_hash_table[hash] = (power << DELTA_SOURCE_POWER_SHIFT) | pos;
1733                                 c->next_delta_hashes[power] = next_hash;
1734                                 prefetchw(&c->delta_hash_table[next_hash]);
1735
1736                                 if (power != cur_match >> DELTA_SOURCE_POWER_SHIFT)
1737                                         continue;
1738
1739                                 const u32 offset = pos - (cur_match & DELTA_SOURCE_RAW_OFFSET_MASK);
1740
1741                                 /* The offset must be a multiple of span.  */
1742                                 if (offset & (span - 1))
1743                                         continue;
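				/* (For example, power = 2 gives span = 4, so
				 * only offsets that are multiples of 4 qualify,
				 * with raw_offset = offset >> 2 below.)  */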
1744
1745                                 const u8 * const matchptr = in_next - offset;
1746
1747                                 /* Check the first 3 bytes before entering the
1748                                  * extension loop.  */
1749                                 STATIC_ASSERT(NBYTES_HASHED_FOR_DELTA == 3);
1750                                 if (((u8)(*(in_next + 0) - *(in_next + 0 - span)) !=
1751                                      (u8)(*(matchptr + 0) - *(matchptr + 0 - span))) ||
1752                                     ((u8)(*(in_next + 1) - *(in_next + 1 - span)) !=
1753                                      (u8)(*(matchptr + 1) - *(matchptr + 1 - span))) ||
1754                                     ((u8)(*(in_next + 2) - *(in_next + 2 - span)) !=
1755                                      (u8)(*(matchptr + 2) - *(matchptr + 2 - span))))
1756                                         continue;
1757
1758                                 /* Extend the delta match to its full length.  */
1759                                 const u32 len = lzms_extend_delta_match(in_next,
1760                                                                         matchptr,
1761                                                                         NBYTES_HASHED_FOR_DELTA,
1762                                                                         in_end - in_next,
1763                                                                         span);
1764
1765                                 const u32 raw_offset = offset >> power;
1766
1767                                 if (unlikely(raw_offset > DELTA_SOURCE_RAW_OFFSET_MASK -
1768                                                           (LZMS_NUM_DELTA_REPS - 1)))
1769                                         continue;
1770
1771                                 const u32 pair = (power << DELTA_SOURCE_POWER_SHIFT) |
1772                                                  raw_offset;
1773                                 const u32 source = DELTA_SOURCE_TAG |
1774                                                    (pair + LZMS_NUM_DELTA_REPS - 1);
1775
1776                                 /* Early out for long explicit offset delta match  */
1777                                 if (len >= c->mf.nice_match_len) {
1778
1779                                         in_next = lzms_skip_bytes(c, len - 1, in_next + 1);
1780
1781                                         lzms_encode_item_list(c, cur_node);
1782                                         lzms_encode_item(c, len, source);
1783
1784                                         c->optimum_nodes[0].state = cur_node->state;
1785                                         cur_node = &c->optimum_nodes[0];
1786
1787                                         cur_node->state.upcoming_lz_offset = 0;
1788                                         cur_node->state.upcoming_delta_pair = pair;
1789                                         lzms_update_lru_queues(&cur_node->state);
1790                                         lzms_update_main_state(&cur_node->state, 1);
1791                                         lzms_update_match_state(&cur_node->state, 1);
1792                                         lzms_update_delta_state(&cur_node->state, 0);
1793                                         goto begin;
1794                                 }
1795
1796                                 while (end_node < cur_node + len)
1797                                         (++end_node)->cost = INFINITE_COST;
1798
1799                                 u32 base_cost = cur_node->cost +
1800                                                 lzms_bit_1_cost(cur_node->state.main_state,
1801                                                                 c->probs.main) +
1802                                                 lzms_bit_1_cost(cur_node->state.match_state,
1803                                                                 c->probs.match) +
1804                                                 lzms_bit_0_cost(cur_node->state.delta_state,
1805                                                                 c->probs.delta) +
1806                                                 lzms_delta_source_cost(c, power, raw_offset);
1807
1808                                 u32 l = NBYTES_HASHED_FOR_DELTA;
1809                                 do {
1810                                         u32 cost = base_cost + lzms_fast_length_cost(c, l);
1811                                         if (cost < (cur_node + l)->cost) {
1812                                                 (cur_node + l)->cost = cost;
1813                                                 (cur_node + l)->item = (struct lzms_item) {
1814                                                         .length = l,
1815                                                         .source = source,
1816                                                 };
1817                                                 (cur_node + l)->num_extra_items = 0;
1818                                         }
1819                                 } while (++l <= len);
1820                         }
1821                 }
1822
1823                 /* Literal  */
1824                 if (end_node < cur_node + 1)
1825                         (++end_node)->cost = INFINITE_COST;
1826                 const u32 cur_and_lit_cost = cur_node->cost +
1827                                              lzms_literal_cost(c, cur_node->state.main_state,
1828                                                                *in_next);
1829                 if (cur_and_lit_cost < (cur_node + 1)->cost) {
1830                         (cur_node + 1)->cost = cur_and_lit_cost;
1831                         (cur_node + 1)->item = (struct lzms_item) {
1832                                 .length = 1,
1833                                 .source = *in_next,
1834                         };
1835                         (cur_node + 1)->num_extra_items = 0;
1836                 } else if (c->try_lit_lzrep0 && in_end - (in_next + 1) >= 2) {
1837                         /* try lit + LZ-rep0  */
1838                         const u32 offset =
1839                                 (cur_node->state.prev_lz_offset) ?
1840                                         cur_node->state.prev_lz_offset :
1841                                         cur_node->state.recent_lz_offsets[0];
1842
1843                         if (load_u16_unaligned(in_next + 1) ==
1844                             load_u16_unaligned(in_next + 1 - offset))
1845                         {
1846                                 const u32 rep0_len = lz_extend(in_next + 1,
1847                                                                in_next + 1 - offset,
1848                                                                2,
1849                                                                min(in_end - (in_next + 1),
1850                                                                    c->mf.nice_match_len));
1851
1852                                 unsigned main_state = cur_node->state.main_state;
1853
1854                                 /* Update main_state after literal  */
1855                                 main_state = (main_state << 1 | 0) % LZMS_NUM_MAIN_PROBS;
1856
1857                                 /* Add cost of LZ-rep0  */
1858                                 const u32 cost = cur_and_lit_cost +
1859                                                  lzms_bit_1_cost(main_state, c->probs.main) +
1860                                                  lzms_bit_0_cost(cur_node->state.match_state,
1861                                                                  c->probs.match) +
1862                                                  lzms_bit_1_cost(cur_node->state.lz_state,
1863                                                                  c->probs.lz) +
1864                                                  lzms_bit_0_cost(cur_node->state.lz_rep_states[0],
1865                                                                  c->probs.lz_rep[0]) +
1866                                                  lzms_fast_length_cost(c, rep0_len);
1867
1868                                 const u32 total_len = 1 + rep0_len;
1869
1870                                 while (end_node < cur_node + total_len)
1871                                         (++end_node)->cost = INFINITE_COST;
1872
1873                                 if (cost < (cur_node + total_len)->cost) {
1874                                         (cur_node + total_len)->cost = cost;
1875                                         (cur_node + total_len)->item = (struct lzms_item) {
1876                                                 .length = rep0_len,
1877                                                 .source = 0,
1878                                         };
1879                                         (cur_node + total_len)->extra_items[0] = (struct lzms_item) {
1880                                                 .length = 1,
1881                                                 .source = *in_next,
1882                                         };
1883                                         (cur_node + total_len)->num_extra_items = 1;
1884                                 }
1885                         }
1886                 }
1887
1888                 /* Advance to the next position.  */
1889                 in_next++;
1890                 cur_node++;
1891
1892                 /* The lowest-cost path to the current position is now known.
1893                  * Finalize the adaptive state that results from taking this
1894                  * lowest-cost path.  */
1895                 struct lzms_item item_to_take = cur_node->item;
1896                 struct lzms_optimum_node *source_node = cur_node - item_to_take.length;
1897                 int next_item_idx = -1;
1898                 for (unsigned i = 0; i < cur_node->num_extra_items; i++) {
1899                         item_to_take = cur_node->extra_items[i];
1900                         source_node -= item_to_take.length;
1901                         next_item_idx++;
1902                 }
1903                 cur_node->state = source_node->state;
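		/* 'source_node' is now the node where this path begins, and
		 * 'item_to_take' is the earliest item on it.  Replay the items
		 * in forward order, applying the same state updates that the
		 * encoder will apply when the items are eventually output.  */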
1904                 for (;;) {
1905                         const u32 length = item_to_take.length;
1906                         u32 source = item_to_take.source;
1907
1908                         cur_node->state.upcoming_lz_offset = 0;
1909                         cur_node->state.upcoming_delta_pair = 0;
1910                         if (length > 1) {
1911                                 /* Match  */
1912
1913                                 lzms_update_main_state(&cur_node->state, 1);
1914
1915                                 if (source & DELTA_SOURCE_TAG) {
1916                                         /* Delta match  */
1917
1918                                         lzms_update_match_state(&cur_node->state, 1);
1919                                         source &= ~DELTA_SOURCE_TAG;
1920
1921                                         if (source >= LZMS_NUM_DELTA_REPS) {
1922                                                 /* Explicit offset delta match  */
1923                                                 lzms_update_delta_state(&cur_node->state, 0);
1924                                                 cur_node->state.upcoming_delta_pair =
1925                                                         source - (LZMS_NUM_DELTA_REPS - 1);
1926                                         } else {
1927                                                 /* Repeat offset delta match  */
1928                                                 int rep_idx = source;
1929
1930                                                 lzms_update_delta_state(&cur_node->state, 1);
1931                                                 lzms_update_delta_rep_states(&cur_node->state, rep_idx);
1932
1933                                                 cur_node->state.upcoming_delta_pair =
1934                                                         cur_node->state.recent_delta_pairs[rep_idx];
1935
1936                                                 for (int i = rep_idx; i < LZMS_NUM_DELTA_REPS; i++)
1937                                                         cur_node->state.recent_delta_pairs[i] =
1938                                                                 cur_node->state.recent_delta_pairs[i + 1];
1939                                         }
1940                                 } else {
1941                                         lzms_update_match_state(&cur_node->state, 0);
1942
1943                                         if (source >= LZMS_NUM_LZ_REPS) {
1944                                                 /* Explicit offset LZ match  */
1945                                                 lzms_update_lz_state(&cur_node->state, 0);
1946                                                 cur_node->state.upcoming_lz_offset =
1947                                                         source - (LZMS_NUM_LZ_REPS - 1);
1948                                         } else {
1949                                                 /* Repeat offset LZ match  */
1950                                                 int rep_idx = source;
1951
1952                                                 lzms_update_lz_state(&cur_node->state, 1);
1953                                                 lzms_update_lz_rep_states(&cur_node->state, rep_idx);
1954
1955                                                 cur_node->state.upcoming_lz_offset =
1956                                                         cur_node->state.recent_lz_offsets[rep_idx];
1957
1958                                                 for (int i = rep_idx; i < LZMS_NUM_LZ_REPS; i++)
1959                                                         cur_node->state.recent_lz_offsets[i] =
1960                                                                 cur_node->state.recent_lz_offsets[i + 1];
1961                                         }
1962                                 }
1963                         } else {
1964                                 /* Literal  */
1965                                 lzms_update_main_state(&cur_node->state, 0);
1966                         }
1967
1968                         lzms_update_lru_queues(&cur_node->state);
1969
1970                         if (next_item_idx < 0)
1971                                 break;
1972                         if (next_item_idx == 0)
1973                                 item_to_take = cur_node->item;
1974                         else
1975                                 item_to_take = cur_node->extra_items[next_item_idx - 1];
1976                         --next_item_idx;
1977                 }
1978
1979                 /*
1980                  * This loop will terminate when either of the following
1981                  * conditions is true:
1982                  *
1983                  * (1) cur_node == end_node
1984                  *
1985                  *      There are no paths that extend beyond the current
1986                  *      position.  In this case, any path to a later position
1987                  *      must pass through the current position, so we can go
1988                  *      ahead and choose the list of items that led to this
1989                  *      position.
1990                  *
1991                  * (2) cur_node == &c->optimum_nodes[NUM_OPTIM_NODES]
1992                  *
1993                  *      This bounds the number of times the algorithm can step
1994                  *      forward before it is guaranteed to start choosing items.
1995                  *      This limits the memory usage.  It also guarantees that
1996                  *      the parser will not go too long without updating the
1997                  *      probability tables.
1998                  *
1999                  * Note: no check for end-of-buffer is needed because
2000                  * end-of-buffer will trigger condition (1).
2001                  */
2002                 if (cur_node == end_node ||
2003                     cur_node == &c->optimum_nodes[NUM_OPTIM_NODES])
2004                 {
2005                         lzms_encode_nonempty_item_list(c, cur_node);
2006                         c->optimum_nodes[0].state = cur_node->state;
2007                         goto begin;
2008                 }
2009         }
2010 }
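/*
 * An illustrative sketch (guarded out) of the adaptive state indices used
 * throughout the parse above: each binary decision shifts its bit into the
 * state, and the modulo keeps the state within the probability table.  E.g.
 * with a 16-entry table, the state after bits 1, 0, 1, 1 is 0b1011 = 11.
 */
#if 0
static unsigned
toy_update_state(unsigned state, int bit, unsigned num_states)
{
	return ((state << 1) | bit) % num_states;
}
#endif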
2011
2012 static void
2013 lzms_init_states_and_probabilities(struct lzms_compressor *c)
2014 {
2015         c->main_state = 0;
2016         c->match_state = 0;
2017         c->lz_state = 0;
2018         for (int i = 0; i < LZMS_NUM_LZ_REP_DECISIONS; i++)
2019                 c->lz_rep_states[i] = 0;
2020         c->delta_state = 0;
2021         for (int i = 0; i < LZMS_NUM_DELTA_REP_DECISIONS; i++)
2022                 c->delta_rep_states[i] = 0;
2023
2024         lzms_init_probabilities(&c->probs);
2025 }
2026
2027 static void
2028 lzms_init_huffman_codes(struct lzms_compressor *c, unsigned num_offset_slots)
2029 {
2030         lzms_init_huffman_code(&c->literal_rebuild_info,
2031                                LZMS_NUM_LITERAL_SYMS,
2032                                LZMS_LITERAL_CODE_REBUILD_FREQ,
2033                                c->literal_codewords,
2034                                c->literal_lens,
2035                                c->literal_freqs);
2036
2037         lzms_init_huffman_code(&c->lz_offset_rebuild_info,
2038                                num_offset_slots,
2039                                LZMS_LZ_OFFSET_CODE_REBUILD_FREQ,
2040                                c->lz_offset_codewords,
2041                                c->lz_offset_lens,
2042                                c->lz_offset_freqs);
2043
2044         lzms_init_huffman_code(&c->length_rebuild_info,
2045                                LZMS_NUM_LENGTH_SYMS,
2046                                LZMS_LENGTH_CODE_REBUILD_FREQ,
2047                                c->length_codewords,
2048                                c->length_lens,
2049                                c->length_freqs);
2050
2051         lzms_init_huffman_code(&c->delta_offset_rebuild_info,
2052                                num_offset_slots,
2053                                LZMS_DELTA_OFFSET_CODE_REBUILD_FREQ,
2054                                c->delta_offset_codewords,
2055                                c->delta_offset_lens,
2056                                c->delta_offset_freqs);
2057
2058         lzms_init_huffman_code(&c->delta_power_rebuild_info,
2059                                LZMS_NUM_DELTA_POWER_SYMS,
2060                                LZMS_DELTA_POWER_CODE_REBUILD_FREQ,
2061                                c->delta_power_codewords,
2062                                c->delta_power_lens,
2063                                c->delta_power_freqs);
2064 }
2065
2066 /*
2067  * Flush the output streams, prepare the final compressed data, and return its
2068  * size in bytes.
2069  *
2070  * A return value of 0 indicates that the data could not be compressed to fit in
2071  * the available space.
2072  */
2073 static size_t
2074 lzms_finalize(struct lzms_compressor *c)
2075 {
2076         size_t num_forwards_bytes;
2077         size_t num_backwards_bytes;
2078
2079         /* Flush both the forwards and backwards streams, and make sure they
2080          * didn't cross each other and start overwriting each other's data.  */
2081         if (!lzms_output_bitstream_flush(&c->os))
2082                 return 0;
2083
2084         if (!lzms_range_encoder_flush(&c->rc))
2085                 return 0;
2086
2087         if (c->rc.next > c->os.next)
2088                 return 0;
2089
2090         /* Now the compressed buffer contains the data output by the forwards
2091          * bitstream, then empty space, then data output by the backwards
2092          * bitstream.  Move the data output by the backwards bitstream to be
2093  * adjacent to the data output by the forwards bitstream, and calculate
2094          * the compressed size that this results in.  */
2095         num_forwards_bytes = c->rc.next - c->rc.begin;
2096         num_backwards_bytes = c->rc.end - c->os.next;
2097
2098         memmove(c->rc.next, c->os.next, num_backwards_bytes);
2099
2100         return num_forwards_bytes + num_backwards_bytes;
2101 }
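/*
 * An illustrative sketch (guarded out) of the compaction above, with made-up
 * sizes: a 100-byte output buffer holding 30 forwards bytes at its start and
 * 20 backwards bytes at its end compacts to 50 bytes of compressed data.
 */
#if 0
static size_t
toy_compact(u8 *buf /* 100 bytes */)
{
	u8 *fwd_next = buf + 30;		/* plays the role of c->rc.next */
	u8 *bwd_next = buf + 80;		/* plays the role of c->os.next */
	size_t n_fwd = fwd_next - buf;		/* 30 */
	size_t n_bwd = (buf + 100) - bwd_next;	/* 20 */

	memmove(fwd_next, bwd_next, n_bwd);
	return n_fwd + n_bwd;			/* 50 */
}
#endif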
2102
2103 static u64
2104 lzms_get_needed_memory(size_t max_bufsize, unsigned compression_level,
2105                        bool destructive)
2106 {
2107         u64 size = 0;
2108
2109         if (max_bufsize > LZMS_MAX_BUFFER_SIZE)
2110                 return 0;
2111
2112         size += sizeof(struct lzms_compressor);
2113
2114         if (!destructive)
2115                 size += max_bufsize; /* in_buffer */
2116
2117         /* mf */
2118         size += lcpit_matchfinder_get_needed_memory(max_bufsize);
2119
2120         return size;
2121 }
2122
2123 static int
2124 lzms_create_compressor(size_t max_bufsize, unsigned compression_level,
2125                        bool destructive, void **c_ret)
2126 {
2127         struct lzms_compressor *c;
2128         u32 nice_match_len;
2129
2130         if (max_bufsize > LZMS_MAX_BUFFER_SIZE)
2131                 return WIMLIB_ERR_INVALID_PARAM;
2132
2133         c = ALIGNED_MALLOC(sizeof(struct lzms_compressor), 64);
2134         if (!c)
2135                 goto oom0;
2136
2137         c->destructive = destructive;
2138
2139         /* Scale nice_match_len with the compression level.  But to allow an
2140          * optimization for length cost calculations, don't allow nice_match_len
2141          * to exceed MAX_FAST_LENGTH.  */
2142         nice_match_len = min(((u64)compression_level * 63) / 50, MAX_FAST_LENGTH);
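	/* (For example, compression level 50 gives (50 * 63) / 50 = 63; the
	 * MAX_FAST_LENGTH cap takes effect at around level 203.)  */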
2143
2144         c->use_delta_matches = (compression_level >= 35);
2145         c->try_lzmatch_lit_lzrep0 = (compression_level >= 45);
2146         c->try_lit_lzrep0 = (compression_level >= 60);
2147         c->try_lzrep_lit_lzrep0 = (compression_level >= 60);
2148
2149         if (!c->destructive) {
2150                 c->in_buffer = MALLOC(max_bufsize);
2151                 if (!c->in_buffer)
2152                         goto oom1;
2153         }
2154
2155         if (!lcpit_matchfinder_init(&c->mf, max_bufsize, 2, nice_match_len))
2156                 goto oom2;
2157
2158         lzms_init_fast_length_slot_tab(c);
2159         lzms_init_offset_slot_tabs(c);
2160
2161         *c_ret = c;
2162         return 0;
2163
2164 oom2:
2165         if (!c->destructive)
2166                 FREE(c->in_buffer);
2167 oom1:
2168         ALIGNED_FREE(c);
2169 oom0:
2170         return WIMLIB_ERR_NOMEM;
2171 }
2172
2173 static size_t
2174 lzms_compress(const void *restrict in, size_t in_nbytes,
2175               void *restrict out, size_t out_nbytes_avail, void *restrict _c)
2176 {
2177         struct lzms_compressor *c = _c;
2178         size_t result;
2179
2180         /* Don't bother trying to compress extremely small inputs.  */
2181         if (in_nbytes < 4)
2182                 return 0;
2183
2184         /* Copy the input data into the internal buffer and preprocess it.  */
2185         if (c->destructive)
2186                 c->in_buffer = (void *)in;
2187         else
2188                 memcpy(c->in_buffer, in, in_nbytes);
2189         c->in_nbytes = in_nbytes;
2190         lzms_x86_filter(c->in_buffer, in_nbytes, c->last_target_usages, false);
2191
2192         /* Prepare the matchfinders.  */
2193         lcpit_matchfinder_load_buffer(&c->mf, c->in_buffer, c->in_nbytes);
2194         if (c->use_delta_matches)
2195                 lzms_init_delta_matchfinder(c);
2196
2197         /* Initialize the encoder structures.  */
2198         lzms_range_encoder_init(&c->rc, out, out_nbytes_avail);
2199         lzms_output_bitstream_init(&c->os, out, out_nbytes_avail);
2200         lzms_init_states_and_probabilities(c);
2201         lzms_init_huffman_codes(c, lzms_get_num_offset_slots(c->in_nbytes));
2202
2203         /* The main loop: parse and encode.  */
2204         lzms_near_optimal_parse(c);
2205
2206         /* Return the compressed data size or 0.  */
2207         result = lzms_finalize(c);
2208         if (!result && c->destructive)
2209                 lzms_x86_filter(c->in_buffer, c->in_nbytes, c->last_target_usages, true);
2210         return result;
2211 }
2212
2213 static void
2214 lzms_free_compressor(void *_c)
2215 {
2216         struct lzms_compressor *c = _c;
2217
2218         if (!c->destructive)
2219                 FREE(c->in_buffer);
2220         lcpit_matchfinder_destroy(&c->mf);
2221         ALIGNED_FREE(c);
2222 }
2223
2224 const struct compressor_ops lzms_compressor_ops = {
2225         .get_needed_memory  = lzms_get_needed_memory,
2226         .create_compressor  = lzms_create_compressor,
2227         .compress           = lzms_compress,
2228         .free_compressor    = lzms_free_compressor,
2229 };
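/*
 * A hypothetical usage sketch (guarded out; real callers go through wimlib's
 * generic compressor API rather than calling these functions directly):
 * create a compressor sized for one buffer, compress it once, and clean up.
 * Error handling is omitted for brevity.
 */
#if 0
static size_t
toy_compress_once(const void *in, size_t in_size, void *out, size_t out_avail)
{
	void *c;
	size_t csize = 0;

	if (lzms_create_compressor(in_size, 50, false, &c) == 0) {
		csize = lzms_compress(in, in_size, out, out_avail, c);
		lzms_free_compressor(c);
	}
	return csize;
}
#endif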