1 /*
2  * lzms_compress.c
3  *
4  * A compressor for the LZMS compression format.
5  */
6
7 /*
8  * Copyright (C) 2013, 2014, 2015 Eric Biggers
9  *
10  * This file is free software; you can redistribute it and/or modify it under
11  * the terms of the GNU Lesser General Public License as published by the Free
12  * Software Foundation; either version 3 of the License, or (at your option) any
13  * later version.
14  *
15  * This file is distributed in the hope that it will be useful, but WITHOUT
16  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17  * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
18  * details.
19  *
20  * You should have received a copy of the GNU Lesser General Public License
21  * along with this file; if not, see http://www.gnu.org/licenses/.
22  */
23
24 #ifdef HAVE_CONFIG_H
25 #  include "config.h"
26 #endif
27
28 #include <limits.h>
29 #include <string.h>
30
31 #include "wimlib/compress_common.h"
32 #include "wimlib/compressor_ops.h"
33 #include "wimlib/error.h"
34 #include "wimlib/lcpit_matchfinder.h"
35 #include "wimlib/lz_extend.h"
36 #include "wimlib/lz_hash.h"
37 #include "wimlib/lzms_common.h"
38 #include "wimlib/unaligned.h"
39 #include "wimlib/util.h"
40
41 /*
42  * MAX_FAST_LENGTH is the maximum match length for which the length slot can be
43  * looked up directly in 'fast_length_slot_tab' and the length cost can be
44  * looked up directly in 'fast_length_cost_tab'.
45  *
46  * We also limit the 'nice_match_len' parameter to this value.  Consequently, if
47  * the longest match found is shorter than 'nice_match_len', then it must also
48  * be shorter than MAX_FAST_LENGTH.  This makes it possible to do fast lookups
49  * of length costs using 'fast_length_cost_tab' without having to keep checking
50  * whether the length exceeds MAX_FAST_LENGTH or not.
51  */
52 #define MAX_FAST_LENGTH         255
53
54 /* NUM_OPTIM_NODES is the maximum number of bytes the parsing algorithm will
55  * step forward before forcing the pending items to be encoded.  If this value
56  * is increased, then there will be fewer forced flushes, but the probability
57  * entries and Huffman codes will be more likely to become outdated.  */
58 #define NUM_OPTIM_NODES         2048
59
60 /* COST_SHIFT is a scaling factor that makes it possible to consider fractional
61  * bit costs.  A single bit has a cost of (1 << COST_SHIFT).  */
62 #define COST_SHIFT              6
63
64 /* Order (log2 of the number of entries) and length of the hash table for
65  * finding delta matches  */
65 #define DELTA_HASH_ORDER        17
66 #define DELTA_HASH_LENGTH       ((u32)1 << DELTA_HASH_ORDER)
67
68 /* The number of bytes to hash when finding delta matches; also taken to be the
69  * minimum length of an explicit offset delta match  */
70 #define NBYTES_HASHED_FOR_DELTA 3
71
72 /* The number of delta match powers to consider (must be <=
73  * LZMS_NUM_DELTA_POWER_SYMS)  */
74 #define NUM_POWERS_TO_CONSIDER  6
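
/* Illustrative note: a delta match's span is (u32)1 << power (see
 * lzms_delta_matchfinder_skip_bytes() below), so considering 6 powers means
 * considering spans of 1, 2, 4, 8, 16, and 32 bytes.  */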
75
76 /* This structure tracks the state of writing bits as a series of 16-bit coding
77  * units, starting at the end of the output buffer and proceeding backwards.  */
78 struct lzms_output_bitstream {
79
80         /* Bits that haven't yet been written to the output buffer  */
81         u64 bitbuf;
82
83         /* Number of bits currently held in @bitbuf  */
84         unsigned bitcount;
85
86         /* Pointer to one past the next position in the output buffer at which
87          * to output a 16-bit coding unit  */
88         le16 *next;
89
90         /* Pointer to the beginning of the output buffer (this is the "end" when
91          * writing backwards!)  */
92         le16 *begin;
93 };
94
95 /* This structure tracks the state of range encoding and its output, which
96  * starts at the beginning of the output buffer and proceeds forwards.  */
97 struct lzms_range_encoder {
98
99         /* The lower boundary of the current range.  Logically, this is a 33-bit
100          * integer whose high bit is needed to detect carries.  */
101         u64 lower_bound;
102
103         /* The size of the current range  */
104         u32 range_size;
105
106         /* The next 16-bit coding unit to output  */
107         u16 cache;
108
109         /* The number of 16-bit coding units whose output has been delayed due
110          * to possible carrying.  The first such coding unit is @cache; all
111          * subsequent such coding units are 0xffff.  */
112         u32 cache_size;
113
114         /* Pointer to the beginning of the output buffer  */
115         le16 *begin;
116
117         /* Pointer to the position in the output buffer at which the next coding
118          * unit must be written  */
119         le16 *next;
120
121         /* Pointer to just past the end of the output buffer  */
122         le16 *end;
123 };
124
125 /* Bookkeeping information for an adaptive Huffman code  */
126 struct lzms_huffman_rebuild_info {
127
128         /* The remaining number of symbols to encode until this code must be
129          * rebuilt  */
130         unsigned num_syms_until_rebuild;
131
132         /* The number of symbols in this code  */
133         unsigned num_syms;
134
135         /* The rebuild frequency of this code, in symbols  */
136         unsigned rebuild_freq;
137
138         /* The Huffman codeword of each symbol in this code  */
139         u32 *codewords;
140
141         /* The length of each Huffman codeword, in bits  */
142         u8 *lens;
143
144         /* The frequency of each symbol in this code  */
145         u32 *freqs;
146 };
147
148 /*
149  * The compressor-internal representation of a match or literal.
150  *
151  * Literals have length=1; matches have length > 1.  (We disallow matches of
152  * length 1, even though this is a valid length in LZMS.)
153  *
154  * The source is encoded as follows:
155  *
156  * - Literals: the literal byte itself
157  * - Explicit offset LZ matches: the match offset plus (LZMS_NUM_LZ_REPS - 1)
158  * - Repeat offset LZ matches: the index of the offset in recent_lz_offsets
159  * - Explicit offset delta matches: DELTA_SOURCE_TAG is set, the next 3 bits are
160  *   the power, and the remainder is the raw offset plus (LZMS_NUM_DELTA_REPS-1)
161  * - Repeat offset delta matches: DELTA_SOURCE_TAG is set, and the remainder is
162  *   the index of the (power, raw_offset) pair in recent_delta_pairs
163  */
164 struct lzms_item {
165         u32 length;
166         u32 source;
167 };
168
169 #define DELTA_SOURCE_TAG                ((u32)1 << 31)
170 #define DELTA_SOURCE_POWER_SHIFT        28
171 #define DELTA_SOURCE_RAW_OFFSET_MASK    (((u32)1 << DELTA_SOURCE_POWER_SHIFT) - 1)
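
/* Worked example of the 'source' encoding described above (illustrative;
 * assumes LZMS_NUM_LZ_REPS == 3 and LZMS_NUM_DELTA_REPS == 3):
 *
 *	- literal 'A':                            source == 0x41
 *	- repeat offset LZ match, queue index 1:  source == 1
 *	- explicit offset LZ match, offset 100:   source == 100 + (3 - 1) == 102
 *	- explicit offset delta match, power 2, raw offset 50:
 *	  source == DELTA_SOURCE_TAG | (2 << DELTA_SOURCE_POWER_SHIFT) |
 *		    (50 + (3 - 1))
 */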
172
173 static inline void
174 check_that_powers_fit_in_bitfield(void)
175 {
176         BUILD_BUG_ON(LZMS_NUM_DELTA_POWER_SYMS > (1 << (31 - DELTA_SOURCE_POWER_SHIFT)));
177 }
178
179 /* A stripped-down version of the adaptive state in LZMS which excludes the
180  * probability entries and Huffman codes  */
181 struct lzms_adaptive_state {
182
183         /* Recent offsets for LZ matches  */
184         u32 recent_lz_offsets[LZMS_NUM_LZ_REPS + 1];
185         u32 prev_lz_offset; /* 0 means none */
186         u32 upcoming_lz_offset; /* 0 means none */
187
188         /* Recent (power, raw offset) pairs for delta matches.
189          * The low DELTA_SOURCE_POWER_SHIFT bits of each entry are the raw
190          * offset, and the high bits are the power.  */
191         u32 recent_delta_pairs[LZMS_NUM_DELTA_REPS + 1];
192         u32 prev_delta_pair; /* 0 means none */
193         u32 upcoming_delta_pair; /* 0 means none  */
194
195         /* States for predicting the probabilities of item types  */
196         u8 main_state;
197         u8 match_state;
198         u8 lz_state;
199         u8 lz_rep_states[LZMS_NUM_LZ_REP_DECISIONS];
200         u8 delta_state;
201         u8 delta_rep_states[LZMS_NUM_DELTA_REP_DECISIONS];
202 } _aligned_attribute(64);
203
204 /*
205  * This structure represents a byte position in the preprocessed input data and
206  * a node in the graph of possible match/literal choices.
207  *
208  * Logically, each incoming edge to this node is labeled with a literal or a
209  * match that can be taken to reach this position from an earlier position; and
210  * each outgoing edge from this node is labeled with a literal or a match that
211  * can be taken to advance from this position to a later position.
212  */
213 struct lzms_optimum_node {
214
215         /*
216          * The cost of the lowest-cost path that has been found to reach this
217          * position.  This can change as progressively lower cost paths are
218          * found to reach this position.
219          */
220         u32 cost;
221 #define INFINITE_COST UINT32_MAX
222
223         /*
224          * @item is the last item that was taken to reach this position with the
225          * stored @cost.  This can change as progressively lower
226          * cost paths are found to reach this position.
227          *
228          * In some cases we look ahead more than one item.  If we looked ahead n
229          * items to reach this position, then @item is the last item taken,
230          * @extra_items contains the other items ordered from second-to-last to
231          * first, and @num_extra_items is n - 1.
232          */
233         unsigned num_extra_items;
234         struct lzms_item item;
235         struct lzms_item extra_items[2];
236
237         /*
238          * The adaptive state that exists at this position.  This is filled in
239          * lazily, only after the minimum-cost path to this position is found.
240          *
241          * Note: the way the algorithm handles this adaptive state in the
242          * "minimum-cost" parse is actually only an approximation.  It's
243          * possible for the globally optimal, minimum cost path to contain a
244          * prefix, ending at a position, where that path prefix is *not* the
245          * minimum cost path to that position.  This can happen if such a path
246          * prefix results in a different adaptive state which results in lower
247          * costs later.  Although the algorithm does do some heuristic
248          * multi-item lookaheads, it does not solve this problem in general.
249          *
250          * Note: this adaptive state structure also does not include the
251          * probability entries or current Huffman codewords.  Those aren't
252          * maintained per-position and are only updated occasionally.
253          */
254         struct lzms_adaptive_state state;
255 } _aligned_attribute(64);
256
257 /* The main compressor structure  */
258 struct lzms_compressor {
259
260         /* The matchfinder for LZ matches  */
261         struct lcpit_matchfinder mf;
262
263         /* The preprocessed buffer of data being compressed  */
264         u8 *in_buffer;
265
266         /* The number of bytes of data to be compressed, which is the number of
267          * bytes of data in @in_buffer that are actually valid  */
268         size_t in_nbytes;
269
270         /*
271          * Boolean flags to enable consideration of various types of multi-step
272          * operations during parsing.
273          *
274          * Among other cases, multi-step operations can help with gaps where two
275          * matches are separated by a non-matching byte.
276          *
277          * This idea is borrowed from Igor Pavlov's LZMA encoder.
278          */
279         bool try_lit_lzrep0;
280         bool try_lzrep_lit_lzrep0;
281         bool try_lzmatch_lit_lzrep0;
282
283         /*
284          * If true, the compressor can use delta matches.  This slows down
285          * compression.  It improves the compression ratio greatly, slightly, or
286          * not at all, depending on the input data.
287          */
288         bool use_delta_matches;
289
290         /* 'last_target_usages' is a large array that is only needed for
291          * preprocessing, so it is in union with fields that don't need to be
292          * initialized until after preprocessing.  */
293         union {
294         struct {
295
296         /* Temporary space to store matches found by the LZ matchfinder  */
297         struct lz_match matches[MAX_FAST_LENGTH - LZMS_MIN_MATCH_LENGTH + 1];
298
299         /* Hash table for finding delta matches  */
300         u32 delta_hash_table[DELTA_HASH_LENGTH];
301
302         /* For each delta power, the hash code for the next sequence  */
303         u32 next_delta_hashes[NUM_POWERS_TO_CONSIDER];
304
305         /* The per-byte graph nodes for near-optimal parsing  */
306         struct lzms_optimum_node optimum_nodes[NUM_OPTIM_NODES + MAX_FAST_LENGTH];
307
308         /* Table: length => current cost for small match lengths  */
309         u32 fast_length_cost_tab[MAX_FAST_LENGTH + 1];
310
311         /* Range encoder which outputs to the beginning of the compressed data
312          * buffer, proceeding forwards  */
313         struct lzms_range_encoder rc;
314
315         /* Bitstream which outputs to the end of the compressed data buffer,
316          * proceeding backwards  */
317         struct lzms_output_bitstream os;
318
319         /* States and probability entries for item type disambiguation  */
320         unsigned main_state;
321         unsigned match_state;
322         unsigned lz_state;
323         unsigned lz_rep_states[LZMS_NUM_LZ_REP_DECISIONS];
324         unsigned delta_state;
325         unsigned delta_rep_states[LZMS_NUM_DELTA_REP_DECISIONS];
326         struct lzms_probability_entry main_probs[LZMS_NUM_MAIN_PROBS];
327         struct lzms_probability_entry match_probs[LZMS_NUM_MATCH_PROBS];
328         struct lzms_probability_entry lz_probs[LZMS_NUM_LZ_PROBS];
329         struct lzms_probability_entry lz_rep_probs[LZMS_NUM_LZ_REP_DECISIONS]
330                                                   [LZMS_NUM_LZ_REP_PROBS];
331         struct lzms_probability_entry delta_probs[LZMS_NUM_DELTA_PROBS];
332         struct lzms_probability_entry delta_rep_probs[LZMS_NUM_DELTA_REP_DECISIONS]
333                                                      [LZMS_NUM_DELTA_REP_PROBS];
334
335         /* Huffman codes  */
336
337         struct lzms_huffman_rebuild_info literal_rebuild_info;
338         u32 literal_codewords[LZMS_NUM_LITERAL_SYMS];
339         u8 literal_lens[LZMS_NUM_LITERAL_SYMS];
340         u32 literal_freqs[LZMS_NUM_LITERAL_SYMS];
341
342         struct lzms_huffman_rebuild_info lz_offset_rebuild_info;
343         u32 lz_offset_codewords[LZMS_MAX_NUM_OFFSET_SYMS];
344         u8 lz_offset_lens[LZMS_MAX_NUM_OFFSET_SYMS];
345         u32 lz_offset_freqs[LZMS_MAX_NUM_OFFSET_SYMS];
346
347         struct lzms_huffman_rebuild_info length_rebuild_info;
348         u32 length_codewords[LZMS_NUM_LENGTH_SYMS];
349         u8 length_lens[LZMS_NUM_LENGTH_SYMS];
350         u32 length_freqs[LZMS_NUM_LENGTH_SYMS];
351
352         struct lzms_huffman_rebuild_info delta_offset_rebuild_info;
353         u32 delta_offset_codewords[LZMS_MAX_NUM_OFFSET_SYMS];
354         u8 delta_offset_lens[LZMS_MAX_NUM_OFFSET_SYMS];
355         u32 delta_offset_freqs[LZMS_MAX_NUM_OFFSET_SYMS];
356
357         struct lzms_huffman_rebuild_info delta_power_rebuild_info;
358         u32 delta_power_codewords[LZMS_NUM_DELTA_POWER_SYMS];
359         u8 delta_power_lens[LZMS_NUM_DELTA_POWER_SYMS];
360         u32 delta_power_freqs[LZMS_NUM_DELTA_POWER_SYMS];
361
362         }; /* struct */
363
364         s32 last_target_usages[65536];
365
366         }; /* union */
367
368         /* Table: length => length slot for small match lengths  */
369         u8 fast_length_slot_tab[MAX_FAST_LENGTH + 1];
370
371         /* Tables for mapping offsets to offset slots  */
372
373         /* slots [0, 167); 0 <= num_extra_bits <= 10 */
374         u8 offset_slot_tab_1[0xe4a5];
375
376         /* slots [167, 427); 11 <= num_extra_bits <= 15 */
377         u16 offset_slot_tab_2[0x3d0000 >> 11];
378
379         /* slots [427, 799); 16 <= num_extra_bits  */
380         u16 offset_slot_tab_3[((LZMS_MAX_MATCH_OFFSET + 1) - 0xe4a5) >> 16];
381 };
382
383 /******************************************************************************
384  *                   Offset and length slot acceleration                      *
385  ******************************************************************************/
386
387 /* Generate the acceleration table for length slots.  */
388 static void
389 lzms_init_fast_length_slot_tab(struct lzms_compressor *c)
390 {
391         unsigned slot = 0;
392         for (u32 len = LZMS_MIN_MATCH_LENGTH; len <= MAX_FAST_LENGTH; len++) {
393                 if (len >= lzms_length_slot_base[slot + 1])
394                         slot++;
395                 c->fast_length_slot_tab[len] = slot;
396         }
397 }
398
399 /* Generate the acceleration tables for offset slots.  */
400 static void
401 lzms_init_offset_slot_tabs(struct lzms_compressor *c)
402 {
403         u32 offset;
404         unsigned slot = 0;
405
406         /* slots [0, 167); 0 <= num_extra_bits <= 10 */
407         for (offset = 1; offset < 0xe4a5; offset++) {
408                 if (offset >= lzms_offset_slot_base[slot + 1])
409                         slot++;
410                 c->offset_slot_tab_1[offset] = slot;
411         }
412
413         /* slots [167, 427); 11 <= num_extra_bits <= 15 */
414         for (; offset < 0x3de4a5; offset += (u32)1 << 11) {
415                 if (offset >= lzms_offset_slot_base[slot + 1])
416                         slot++;
417                 c->offset_slot_tab_2[(offset - 0xe4a5) >> 11] = slot;
418         }
419
420         /* slots [427, 799); 16 <= num_extra_bits  */
421         for (; offset < LZMS_MAX_MATCH_OFFSET + 1; offset += (u32)1 << 16) {
422                 if (offset >= lzms_offset_slot_base[slot + 1])
423                         slot++;
424                 c->offset_slot_tab_3[(offset - 0xe4a5) >> 16] = slot;
425         }
426 }
427
428 /*
429  * Return the length slot for the specified match length, using the compressor's
430  * acceleration table if the length is small enough.
431  */
432 static inline unsigned
433 lzms_comp_get_length_slot(const struct lzms_compressor *c, u32 length)
434 {
435         if (likely(length <= MAX_FAST_LENGTH))
436                 return c->fast_length_slot_tab[length];
437         return lzms_get_length_slot(length);
438 }
439
440 /*
441  * Return the offset slot for the specified match offset, using the compressor's
442  * acceleration tables to speed up the mapping.
443  */
444 static inline unsigned
445 lzms_comp_get_offset_slot(const struct lzms_compressor *c, u32 offset)
446 {
447         if (offset < 0xe4a5)
448                 return c->offset_slot_tab_1[offset];
449         offset -= 0xe4a5;
450         if (offset < 0x3d0000)
451                 return c->offset_slot_tab_2[offset >> 11];
452         return c->offset_slot_tab_3[offset >> 16];
453 }
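
#if 0
/* Sanity-check sketch (not compiled in): verify the acceleration tables
 * against a generic slot computation.  This assumes a helper
 * lzms_get_offset_slot(), analogous to the lzms_get_length_slot() used above,
 * is available from lzms_common.h.  */
static void
lzms_verify_offset_slot_tabs(const struct lzms_compressor *c)
{
	for (u32 offset = 1; offset <= LZMS_MAX_MATCH_OFFSET; offset++)
		if (lzms_comp_get_offset_slot(c, offset) !=
		    lzms_get_offset_slot(offset))
			WARNING("offset slot mismatch at offset %u", offset);
}
#endif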
454
455 /******************************************************************************
456  *                             Range encoding                                 *
457  ******************************************************************************/
458
459 /*
460  * Initialize the range encoder @rc to write forwards to the specified buffer
461  * @out that is @count 16-bit integers long.
462  */
463 static void
464 lzms_range_encoder_init(struct lzms_range_encoder *rc, le16 *out, size_t count)
465 {
466         rc->lower_bound = 0;
467         rc->range_size = 0xffffffff;
468         rc->cache = 0;
469         rc->cache_size = 1;
470         rc->begin = out;
471         rc->next = out - 1;
472         rc->end = out + count;
473 }
474
475 /*
476  * Attempt to flush bits from the range encoder.
477  *
478  * The basic idea is that we're writing bits from @rc->lower_bound to the
479  * output.  However, due to carrying, the writing of coding units with the
480  * maximum value, as well as one prior coding unit, must be delayed until it is
481  * determined whether a carry is needed.
482  *
483  * This is based on the public domain code for LZMA written by Igor Pavlov, but
484  * with the following differences:
485  *
486  *      - In LZMS, 16-bit coding units are required rather than 8-bit.
487  *
488  *      - In LZMS, the first coding unit is not ignored by the decompressor, so
489  *        the encoder cannot output a dummy value to that position.
490  */
491 static void
492 lzms_range_encoder_shift_low(struct lzms_range_encoder *rc)
493 {
494         if ((u32)(rc->lower_bound) < 0xffff0000 ||
495             (u32)(rc->lower_bound >> 32) != 0)
496         {
497                 /* Carry not needed (rc->lower_bound < 0xffff0000), or carry
498                  * occurred ((rc->lower_bound >> 32) != 0, a.k.a. the carry bit
499                  * is 1).  */
500                 do {
501                         if (likely(rc->next >= rc->begin)) {
502                                 if (rc->next != rc->end) {
503                                         put_unaligned_u16_le(rc->cache +
504                                                              (u16)(rc->lower_bound >> 32),
505                                                              rc->next++);
506                                 }
507                         } else {
508                                 rc->next++;
509                         }
510                         rc->cache = 0xffff;
511                 } while (--rc->cache_size != 0);
512
513                 rc->cache = (rc->lower_bound >> 16) & 0xffff;
514         }
515         ++rc->cache_size;
516         rc->lower_bound = (rc->lower_bound & 0xffff) << 16;
517 }
518
519 static bool
520 lzms_range_encoder_flush(struct lzms_range_encoder *rc)
521 {
522         for (int i = 0; i < 4; i++)
523                 lzms_range_encoder_shift_low(rc);
524         return rc->next != rc->end;
525 }
526
527 /*
528  * Encode the next bit using the range encoder.
529  *
530  * @prob is the probability out of LZMS_PROBABILITY_DENOMINATOR that the next
531  * bit is 0 rather than 1.
532  */
533 static inline void
534 lzms_range_encode_bit(struct lzms_range_encoder *rc, int bit, u32 prob)
535 {
536         /* Normalize if needed.  */
537         if (rc->range_size <= 0xffff) {
538                 rc->range_size <<= 16;
539                 lzms_range_encoder_shift_low(rc);
540         }
541
542         u32 bound = (rc->range_size >> LZMS_PROBABILITY_BITS) * prob;
543         if (bit == 0) {
544                 rc->range_size = bound;
545         } else {
546                 rc->lower_bound += bound;
547                 rc->range_size -= bound;
548         }
549 }
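
/* Illustrative numbers (assuming LZMS_PROBABILITY_BITS == 6, i.e.
 * LZMS_PROBABILITY_DENOMINATOR == 64, consistent with the lzms_bit_costs
 * table below): if rc->range_size == 0x10000000 and prob == 48, then
 * bound == (0x10000000 >> 6) * 48 == 0xC000000.  Encoding a 0 bit shrinks the
 * range to its lower 75%; encoding a 1 bit keeps the upper 25% and advances
 * rc->lower_bound by 0xC000000.  */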
550
551 /*
552  * Encode a bit.  This wraps around lzms_range_encode_bit() to handle using and
553  * updating the state and its corresponding probability entry.
554  */
555 static inline void
556 lzms_encode_bit(int bit, unsigned *state_p, unsigned num_states,
557                 struct lzms_probability_entry *probs,
558                 struct lzms_range_encoder *rc)
559 {
560         struct lzms_probability_entry *prob_entry;
561         u32 prob;
562
563         /* Load the probability entry for the current state.  */
564         prob_entry = &probs[*state_p];
565
566         /* Update the state based on the next bit.  */
567         *state_p = ((*state_p << 1) | bit) & (num_states - 1);
568
569         /* Get the probability that the bit is 0.  */
570         prob = lzms_get_probability(prob_entry);
571
572         /* Update the probability entry.  */
573         lzms_update_probability_entry(prob_entry, bit);
574
575         /* Encode the bit using the range encoder.  */
576         lzms_range_encode_bit(rc, bit, prob);
577 }
578
579 /* Helper functions for encoding bits in the various decision classes  */
580
581 static void
582 lzms_encode_main_bit(struct lzms_compressor *c, int bit)
583 {
584         lzms_encode_bit(bit, &c->main_state, LZMS_NUM_MAIN_PROBS,
585                         c->main_probs, &c->rc);
586 }
587
588 static void
589 lzms_encode_match_bit(struct lzms_compressor *c, int bit)
590 {
591         lzms_encode_bit(bit, &c->match_state, LZMS_NUM_MATCH_PROBS,
592                         c->match_probs, &c->rc);
593 }
594
595 static void
596 lzms_encode_lz_bit(struct lzms_compressor *c, int bit)
597 {
598         lzms_encode_bit(bit, &c->lz_state, LZMS_NUM_LZ_PROBS,
599                         c->lz_probs, &c->rc);
600 }
601
602 static void
603 lzms_encode_lz_rep_bit(struct lzms_compressor *c, int bit, int idx)
604 {
605         lzms_encode_bit(bit, &c->lz_rep_states[idx], LZMS_NUM_LZ_REP_PROBS,
606                         c->lz_rep_probs[idx], &c->rc);
607 }
608
609 static void
610 lzms_encode_delta_bit(struct lzms_compressor *c, int bit)
611 {
612         lzms_encode_bit(bit, &c->delta_state, LZMS_NUM_DELTA_PROBS,
613                         c->delta_probs, &c->rc);
614 }
615
616 static void
617 lzms_encode_delta_rep_bit(struct lzms_compressor *c, int bit, int idx)
618 {
619         lzms_encode_bit(bit, &c->delta_rep_states[idx], LZMS_NUM_DELTA_REP_PROBS,
620                         c->delta_rep_probs[idx], &c->rc);
621 }
622
623 /******************************************************************************
624  *                   Huffman encoding and verbatim bits                       *
625  ******************************************************************************/
626
627 /*
628  * Initialize the output bitstream @os to write backwards to the specified
629  * buffer @out that is @count 16-bit integers long.
630  */
631 static void
632 lzms_output_bitstream_init(struct lzms_output_bitstream *os,
633                            le16 *out, size_t count)
634 {
635         os->bitbuf = 0;
636         os->bitcount = 0;
637         os->next = out + count;
638         os->begin = out;
639 }
640
641 /*
642  * Write some bits, contained in the low-order @num_bits bits of @bits, to the
643  * output bitstream @os.
644  *
645  * @max_num_bits is a compile-time constant that specifies the maximum number of
646  * bits that can ever be written at this call site.
647  */
648 static inline void
649 lzms_write_bits(struct lzms_output_bitstream *os, const u32 bits,
650                 const unsigned num_bits, const unsigned max_num_bits)
651 {
652         /* Add the bits to the bit buffer variable.  */
653         os->bitcount += num_bits;
654         os->bitbuf = (os->bitbuf << num_bits) | bits;
655
656         /* Check whether any coding units need to be written.  */
657         while (os->bitcount >= 16) {
658
659                 os->bitcount -= 16;
660
661                 /* Write a coding unit, unless it would underflow the buffer. */
662                 if (os->next != os->begin)
663                         put_unaligned_u16_le(os->bitbuf >> os->bitcount, --os->next);
664
665                 /* Optimization for call sites that never write more than 16
666                  * bits at once.  */
667                 if (max_num_bits <= 16)
668                         break;
669         }
670 }
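
/* Illustrative trace: if os->bitcount was 12 and 8 more bits are added, then
 * bitcount becomes 20, the high 16 of the 20 buffered bits (os->bitbuf >> 4)
 * are written at --os->next, and 4 bits remain buffered.  */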
671
672 /*
673  * Flush the output bitstream, ensuring that all bits written to it have been
674  * written to memory.  Return %true if all bits have been output successfully,
675  * or %false if an overrun occurred.
676  */
677 static bool
678 lzms_output_bitstream_flush(struct lzms_output_bitstream *os)
679 {
680         if (os->next == os->begin)
681                 return false;
682
683         if (os->bitcount != 0)
684                 put_unaligned_u16_le(os->bitbuf << (16 - os->bitcount), --os->next);
685
686         return true;
687 }
688
689 static void
690 lzms_build_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info)
691 {
692         make_canonical_huffman_code(rebuild_info->num_syms,
693                                     LZMS_MAX_CODEWORD_LENGTH,
694                                     rebuild_info->freqs,
695                                     rebuild_info->lens,
696                                     rebuild_info->codewords);
697         rebuild_info->num_syms_until_rebuild = rebuild_info->rebuild_freq;
698 }
699
700 static void
701 lzms_init_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info,
702                        unsigned num_syms, unsigned rebuild_freq,
703                        u32 *codewords, u8 *lens, u32 *freqs)
704 {
705         rebuild_info->num_syms = num_syms;
706         rebuild_info->rebuild_freq = rebuild_freq;
707         rebuild_info->codewords = codewords;
708         rebuild_info->lens = lens;
709         rebuild_info->freqs = freqs;
710         lzms_init_symbol_frequencies(freqs, num_syms);
711         lzms_build_huffman_code(rebuild_info);
712 }
713
714 static void
715 lzms_rebuild_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info)
716 {
717         lzms_build_huffman_code(rebuild_info);
718         lzms_dilute_symbol_frequencies(rebuild_info->freqs, rebuild_info->num_syms);
719 }
720
721 /*
722  * Encode a symbol using the specified Huffman code.  Then, if the Huffman code
723  * needs to be rebuilt, rebuild it and return true; otherwise return false.
724  */
725 static inline bool
726 lzms_huffman_encode_symbol(unsigned sym,
727                            const u32 *codewords, const u8 *lens, u32 *freqs,
728                            struct lzms_output_bitstream *os,
729                            struct lzms_huffman_rebuild_info *rebuild_info)
730 {
731         lzms_write_bits(os, codewords[sym], lens[sym], LZMS_MAX_CODEWORD_LENGTH);
732         ++freqs[sym];
733         if (--rebuild_info->num_syms_until_rebuild == 0) {
734                 lzms_rebuild_huffman_code(rebuild_info);
735                 return true;
736         }
737         return false;
738 }
739
740 /* Helper routines to encode symbols using the various Huffman codes  */
741
742 static bool
743 lzms_encode_literal_symbol(struct lzms_compressor *c, unsigned sym)
744 {
745         return lzms_huffman_encode_symbol(sym, c->literal_codewords,
746                                           c->literal_lens, c->literal_freqs,
747                                           &c->os, &c->literal_rebuild_info);
748 }
749
750 static bool
751 lzms_encode_lz_offset_symbol(struct lzms_compressor *c, unsigned sym)
752 {
753         return lzms_huffman_encode_symbol(sym, c->lz_offset_codewords,
754                                           c->lz_offset_lens, c->lz_offset_freqs,
755                                           &c->os, &c->lz_offset_rebuild_info);
756 }
757
758 static bool
759 lzms_encode_length_symbol(struct lzms_compressor *c, unsigned sym)
760 {
761         return lzms_huffman_encode_symbol(sym, c->length_codewords,
762                                           c->length_lens, c->length_freqs,
763                                           &c->os, &c->length_rebuild_info);
764 }
765
766 static bool
767 lzms_encode_delta_offset_symbol(struct lzms_compressor *c, unsigned sym)
768 {
769         return lzms_huffman_encode_symbol(sym, c->delta_offset_codewords,
770                                           c->delta_offset_lens, c->delta_offset_freqs,
771                                           &c->os, &c->delta_offset_rebuild_info);
772 }
773
774 static bool
775 lzms_encode_delta_power_symbol(struct lzms_compressor *c, unsigned sym)
776 {
777         return lzms_huffman_encode_symbol(sym, c->delta_power_codewords,
778                                           c->delta_power_lens, c->delta_power_freqs,
779                                           &c->os, &c->delta_power_rebuild_info);
780 }
781
782 static void
783 lzms_update_fast_length_costs(struct lzms_compressor *c);
784
785 /*
786  * Encode a match length.  If this causes the Huffman code for length symbols to
787  * be rebuilt, also update the length costs array used by the parser.
788  */
789 static void
790 lzms_encode_length(struct lzms_compressor *c, u32 length)
791 {
792         unsigned slot = lzms_comp_get_length_slot(c, length);
793
794         if (lzms_encode_length_symbol(c, slot))
795                 lzms_update_fast_length_costs(c);
796
797         lzms_write_bits(&c->os, length - lzms_length_slot_base[slot],
798                         lzms_extra_length_bits[slot],
799                         LZMS_MAX_EXTRA_LENGTH_BITS);
800 }
801
802 /* Encode the offset of an LZ match.  */
803 static void
804 lzms_encode_lz_offset(struct lzms_compressor *c, u32 offset)
805 {
806         unsigned slot = lzms_comp_get_offset_slot(c, offset);
807
808         lzms_encode_lz_offset_symbol(c, slot);
809         lzms_write_bits(&c->os, offset - lzms_offset_slot_base[slot],
810                         lzms_extra_offset_bits[slot],
811                         LZMS_MAX_EXTRA_OFFSET_BITS);
812 }
813
814 /* Encode the raw offset of a delta match.  */
815 static void
816 lzms_encode_delta_raw_offset(struct lzms_compressor *c, u32 raw_offset)
817 {
818         unsigned slot = lzms_comp_get_offset_slot(c, raw_offset);
819
820         lzms_encode_delta_offset_symbol(c, slot);
821         lzms_write_bits(&c->os, raw_offset - lzms_offset_slot_base[slot],
822                         lzms_extra_offset_bits[slot],
823                         LZMS_MAX_EXTRA_OFFSET_BITS);
824 }
825
826 /******************************************************************************
827  *                             Item encoding                                  *
828  ******************************************************************************/
829
830 /* Encode the specified item, which may be a literal or any type of match.  */
831 static void
832 lzms_encode_item(struct lzms_compressor *c, u32 length, u32 source)
833 {
834         /* Main bit: 0 = literal, 1 = match  */
835         int main_bit = (length > 1);
836         lzms_encode_main_bit(c, main_bit);
837
838         if (!main_bit) {
839                 /* Literal  */
840                 unsigned literal = source;
841                 lzms_encode_literal_symbol(c, literal);
842         } else {
843                 /* Match  */
844
845                 /* Match bit: 0 = LZ match, 1 = delta match  */
846                 int match_bit = (source & DELTA_SOURCE_TAG) != 0;
847                 lzms_encode_match_bit(c, match_bit);
848
849                 if (!match_bit) {
850                         /* LZ match  */
851
852                         /* LZ bit: 0 = explicit offset, 1 = repeat offset  */
853                         int lz_bit = (source < LZMS_NUM_LZ_REPS);
854                         lzms_encode_lz_bit(c, lz_bit);
855
856                         if (!lz_bit) {
857                                 /* Explicit offset LZ match  */
858                                 u32 offset = source - (LZMS_NUM_LZ_REPS - 1);
859                                 lzms_encode_lz_offset(c, offset);
860                         } else {
861                                 /* Repeat offset LZ match  */
862                                 int rep_idx = source;
863                                 for (int i = 0; i < rep_idx; i++)
864                                         lzms_encode_lz_rep_bit(c, 1, i);
865                                 if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS)
866                                         lzms_encode_lz_rep_bit(c, 0, rep_idx);
867                         }
868                 } else {
869                         /* Delta match  */
870
871                         source &= ~DELTA_SOURCE_TAG;
872
873                         /* Delta bit: 0 = explicit offset, 1 = repeat offset  */
874                         int delta_bit = (source < LZMS_NUM_DELTA_REPS);
875                         lzms_encode_delta_bit(c, delta_bit);
876
877                         if (!delta_bit) {
878                                 /* Explicit offset delta match  */
879                                 u32 power = source >> DELTA_SOURCE_POWER_SHIFT;
880                                 u32 raw_offset = (source & DELTA_SOURCE_RAW_OFFSET_MASK) -
881                                                  (LZMS_NUM_DELTA_REPS - 1);
882                                 lzms_encode_delta_power_symbol(c, power);
883                                 lzms_encode_delta_raw_offset(c, raw_offset);
884                         } else {
885                                 /* Repeat offset delta match  */
886                                 int rep_idx = source;
887                                 for (int i = 0; i < rep_idx; i++)
888                                         lzms_encode_delta_rep_bit(c, 1, i);
889                                 if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS)
890                                         lzms_encode_delta_rep_bit(c, 0, rep_idx);
891                         }
892                 }
893
894                 /* Match length (encoded the same way for any match type)  */
895                 lzms_encode_length(c, length);
896         }
897 }
898
899 /* Encode a list of matches and literals chosen by the parsing algorithm.  */
900 static void
901 lzms_encode_nonempty_item_list(struct lzms_compressor *c,
902                                struct lzms_optimum_node *end_node)
903 {
904         /* Since we've stored at each node the item we took to arrive at that
905          * node, we can trace our chosen path in backwards order.  However, for
906          * encoding we need to trace our chosen path in forwards order.  To make
907          * this possible, the following loop moves the items from their
908          * destination nodes to their source nodes, which effectively reverses
909          * the path.  (Think of it like reversing a singly-linked list.)  */
910         struct lzms_optimum_node *cur_node = end_node;
911         struct lzms_item saved_item = cur_node->item;
912         do {
913                 struct lzms_item item = saved_item;
914                 if (cur_node->num_extra_items > 0) {
915                         /* Handle an arrival via multi-item lookahead.  */
916                         unsigned i = 0;
917                         struct lzms_optimum_node *orig_node = cur_node;
918                         do {
919                                 cur_node -= item.length;
920                                 cur_node->item = item;
921                                 item = orig_node->extra_items[i];
922                         } while (++i != orig_node->num_extra_items);
923                 }
924                 cur_node -= item.length;
925                 saved_item = cur_node->item;
926                 cur_node->item = item;
927         } while (cur_node != c->optimum_nodes);
928
929         /* Now trace the chosen path in forwards order, encoding each item.  */
930         do {
931                 lzms_encode_item(c, cur_node->item.length, cur_node->item.source);
932                 cur_node += cur_node->item.length;
933         } while (cur_node != end_node);
934 }
935
936 static inline void
937 lzms_encode_item_list(struct lzms_compressor *c,
938                       struct lzms_optimum_node *end_node)
939 {
940         if (end_node != c->optimum_nodes)
941                 lzms_encode_nonempty_item_list(c, end_node);
942 }
943
944 /******************************************************************************
945  *                            Cost evaluation                                 *
946  ******************************************************************************/
947
948 /*
949  * If p is the predicted probability of the next bit being a 0, then the number
950  * of bits required to encode a 0 bit using a binary range encoder is the real
951  * number -log2(p), and the number of bits required to encode a 1 bit is the
952  * real number -log2(1 - p).  To avoid computing either of these expressions at
953  * runtime, 'lzms_bit_costs' is a precomputed table that stores a mapping from
954  * probability to cost for each possible probability.  Specifically, the array
955  * indices are the numerators of the possible probabilities in LZMS, where the
956  * denominators are LZMS_PROBABILITY_DENOMINATOR; and the stored costs are the
957  * bit costs multiplied by 1<<COST_SHIFT and rounded to the nearest integer.
958  * Furthermore, the values stored for 0% and 100% probabilities are equal to the
959  * adjacent values, since these probabilities are not actually permitted.  This
960  * allows us to use the num_recent_zero_bits value from the
961  * lzms_probability_entry as the array index without fixing up these two special
962  * cases.
963  */
964 static const u32 lzms_bit_costs[LZMS_PROBABILITY_DENOMINATOR + 1] = {
965         384, 384, 320, 283, 256, 235, 219, 204,
966         192, 181, 171, 163, 155, 147, 140, 134,
967         128, 122, 117, 112, 107, 103, 99,  94,
968         91,  87,  83,  80,  76,  73,  70,  67,
969         64,  61,  58,  56,  53,  51,  48,  46,
970         43,  41,  39,  37,  35,  33,  30,  29,
971         27,  25,  23,  21,  19,  17,  16,  14,
972         12,  11,  9,   8,   6,   4,   3,   1,
973         1
974 };
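
/* Spot check of the table above (illustrative): for a zero-bit probability of
 * 48/64, the exact cost is -log2(48.0 / 64) ~= 0.415 bits; scaled by
 * 1 << COST_SHIFT this is 0.415 * 64 ~= 26.6, which rounds to the stored
 * lzms_bit_costs[48] == 27.  */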
975
976 static inline void
977 check_cost_shift(void)
978 {
979         /* lzms_bit_costs is hard-coded to the current COST_SHIFT.  */
980         BUILD_BUG_ON(COST_SHIFT != 6);
981 }
982
983 #if 0
984 #include <math.h>
985
986 static void
987 lzms_compute_bit_costs(void)
988 {
989         for (u32 i = 0; i <= LZMS_PROBABILITY_DENOMINATOR; i++) {
990                 u32 prob = i;
991                 if (prob == 0)
992                         prob++;
993                 else if (prob == LZMS_PROBABILITY_DENOMINATOR)
994                         prob--;
995
996                 lzms_bit_costs[i] = round(-log2((double)prob / LZMS_PROBABILITY_DENOMINATOR) *
997                                           (1 << COST_SHIFT));
998         }
999 }
1000 #endif
1001
1002 /* Return the cost to encode a 0 bit in the specified context.  */
1003 static inline u32
1004 lzms_bit_0_cost(unsigned state, const struct lzms_probability_entry *probs)
1005 {
1006         return lzms_bit_costs[probs[state].num_recent_zero_bits];
1007 }
1008
1009 /* Return the cost to encode a 1 bit in the specified context.  */
1010 static inline u32
1011 lzms_bit_1_cost(unsigned state, const struct lzms_probability_entry *probs)
1012 {
1013         return lzms_bit_costs[LZMS_PROBABILITY_DENOMINATOR -
1014                               probs[state].num_recent_zero_bits];
1015 }
1016
1017 /* Return the cost to encode a literal, including the main bit.  */
1018 static inline u32
1019 lzms_literal_cost(struct lzms_compressor *c, unsigned main_state, unsigned literal)
1020 {
1021         return lzms_bit_0_cost(main_state, c->main_probs) +
1022                 ((u32)c->literal_lens[literal] << COST_SHIFT);
1023 }
1024
1025 /* Update 'fast_length_cost_tab' to use the latest Huffman code.  */
1026 static void
1027 lzms_update_fast_length_costs(struct lzms_compressor *c)
1028 {
1029         int slot = -1;
1030         u32 cost = 0;
1031         for (u32 len = LZMS_MIN_MATCH_LENGTH; len <= MAX_FAST_LENGTH; len++) {
1032                 if (len >= lzms_length_slot_base[slot + 1]) {
1033                         slot++;
1034                         cost = (u32)(c->length_lens[slot] +
1035                                      lzms_extra_length_bits[slot]) << COST_SHIFT;
1036                 }
1037                 c->fast_length_cost_tab[len] = cost;
1038         }
1039 }
1040
1041 /* Return the cost to encode the specified match length, which must not exceed
1042  * MAX_FAST_LENGTH.  */
1043 static inline u32
1044 lzms_fast_length_cost(const struct lzms_compressor *c, u32 length)
1045 {
1046         return c->fast_length_cost_tab[length];
1047 }
1048
1049 /* Return the cost to encode the specified LZ match offset.  */
1050 static inline u32
1051 lzms_lz_offset_cost(const struct lzms_compressor *c, u32 offset)
1052 {
1053         unsigned slot = lzms_comp_get_offset_slot(c, offset);
1054         u32 num_bits = c->lz_offset_lens[slot] + lzms_extra_offset_bits[slot];
1055         return num_bits << COST_SHIFT;
1056 }
1057
1058 /* Return the cost to encode the specified delta power and raw offset.  */
1059 static inline u32
1060 lzms_delta_source_cost(const struct lzms_compressor *c, u32 power, u32 raw_offset)
1061 {
1062         unsigned slot = lzms_comp_get_offset_slot(c, raw_offset);
1063         u32 num_bits = c->delta_power_lens[power] + c->delta_offset_lens[slot] +
1064                        lzms_extra_offset_bits[slot];
1065         return num_bits << COST_SHIFT;
1066 }
1067
1068 /******************************************************************************
1069  *                              Adaptive state                                *
1070  ******************************************************************************/
1071
1072 static void
1073 lzms_init_adaptive_state(struct lzms_adaptive_state *state)
1074 {
1075         for (int i = 0; i < LZMS_NUM_LZ_REPS + 1; i++)
1076                 state->recent_lz_offsets[i] = i + 1;
1077         state->prev_lz_offset = 0;
1078         state->upcoming_lz_offset = 0;
1079
1080         for (int i = 0; i < LZMS_NUM_DELTA_REPS + 1; i++)
1081                 state->recent_delta_pairs[i] = i + 1;
1082         state->prev_delta_pair = 0;
1083         state->upcoming_delta_pair = 0;
1084
1085         state->main_state = 0;
1086         state->match_state = 0;
1087         state->lz_state = 0;
1088         for (int i = 0; i < LZMS_NUM_LZ_REP_DECISIONS; i++)
1089                 state->lz_rep_states[i] = 0;
1090         state->delta_state = 0;
1091         for (int i = 0; i < LZMS_NUM_DELTA_REP_DECISIONS; i++)
1092                 state->delta_rep_states[i] = 0;
1093 }
1094
1095 /*
1096  * Update the LRU queues for match sources when advancing by one item.
1097  *
1098  * Note: using LZMA as a point of comparison, the LRU queues in LZMS are more
1099  * complex because:
1100  *      - there are separate queues for LZ and delta matches
1101  *      - updates to the queues are delayed by one encoded item (this prevents
1102  *        sources from being bumped up to index 0 too early)
1103  */
1104 static void
1105 lzms_update_lru_queues(struct lzms_adaptive_state *state)
1106 {
1107         if (state->prev_lz_offset != 0) {
1108                 for (int i = LZMS_NUM_LZ_REPS - 1; i >= 0; i--)
1109                         state->recent_lz_offsets[i + 1] = state->recent_lz_offsets[i];
1110                 state->recent_lz_offsets[0] = state->prev_lz_offset;
1111         }
1112         state->prev_lz_offset = state->upcoming_lz_offset;
1113
1114         if (state->prev_delta_pair != 0) {
1115                 for (int i = LZMS_NUM_DELTA_REPS - 1; i >= 0; i--)
1116                         state->recent_delta_pairs[i + 1] = state->recent_delta_pairs[i];
1117                 state->recent_delta_pairs[0] = state->prev_delta_pair;
1118         }
1119         state->prev_delta_pair = state->upcoming_delta_pair;
1120 }
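
/* Illustrative timeline of the delayed update: if item N is an explicit
 * offset LZ match with offset X, then X is first recorded in
 * upcoming_lz_offset.  Advancing past item N moves X into prev_lz_offset, and
 * advancing past item N + 1 finally pushes X onto the front of
 * recent_lz_offsets.  */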
1121
1122 static inline void
1123 lzms_update_state(u8 *state_p, int bit, unsigned num_states)
1124 {
1125         *state_p = ((*state_p << 1) | bit) % num_states;
1126 }
1127
1128 static inline void
1129 lzms_update_main_state(struct lzms_adaptive_state *state, int is_match)
1130 {
1131         lzms_update_state(&state->main_state, is_match, LZMS_NUM_MAIN_PROBS);
1132 }
1133
1134 static inline void
1135 lzms_update_match_state(struct lzms_adaptive_state *state, int is_delta)
1136 {
1137         lzms_update_state(&state->match_state, is_delta, LZMS_NUM_MATCH_PROBS);
1138 }
1139
1140 static inline void
1141 lzms_update_lz_state(struct lzms_adaptive_state *state, int is_rep)
1142 {
1143         lzms_update_state(&state->lz_state, is_rep, LZMS_NUM_LZ_PROBS);
1144 }
1145
1146 static inline void
1147 lzms_update_lz_rep_states(struct lzms_adaptive_state *state, int rep_idx)
1148 {
1149         for (int i = 0; i < rep_idx; i++)
1150                 lzms_update_state(&state->lz_rep_states[i], 1, LZMS_NUM_LZ_REP_PROBS);
1151
1152         if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS)
1153                 lzms_update_state(&state->lz_rep_states[rep_idx], 0, LZMS_NUM_LZ_REP_PROBS);
1154 }
1155
1156 static inline void
1157 lzms_update_delta_state(struct lzms_adaptive_state *state, int is_rep)
1158 {
1159         lzms_update_state(&state->delta_state, is_rep, LZMS_NUM_DELTA_PROBS);
1160 }
1161
1162 static inline void
1163 lzms_update_delta_rep_states(struct lzms_adaptive_state *state, int rep_idx)
1164 {
1165         for (int i = 0; i < rep_idx; i++)
1166                 lzms_update_state(&state->delta_rep_states[i], 1, LZMS_NUM_DELTA_REP_PROBS);
1167
1168         if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS)
1169                 lzms_update_state(&state->delta_rep_states[rep_idx], 0, LZMS_NUM_DELTA_REP_PROBS);
1170 }
1171
1172 /******************************************************************************
1173  *                              Matchfinding                                  *
1174  ******************************************************************************/
1175
1176 /* Note: this code just handles finding delta matches.  The code for finding LZ
1177  * matches is elsewhere.  */
1178
1179
1180 /* Initialize the delta matchfinder for a new input buffer.  */
1181 static void
1182 lzms_init_delta_matchfinder(struct lzms_compressor *c)
1183 {
1184         /* Set all entries to use an invalid power, which will never match.  */
1185         BUILD_BUG_ON(NUM_POWERS_TO_CONSIDER >= (1 << (32 - DELTA_SOURCE_POWER_SHIFT)));
1186         memset(c->delta_hash_table, 0xFF, sizeof(c->delta_hash_table));
1187
1188         /* Initialize the next hash code for each power.  We can just use zeroes
1189          * initially; it doesn't really matter.  */
1190         for (u32 i = 0; i < NUM_POWERS_TO_CONSIDER; i++)
1191                 c->next_delta_hashes[i] = 0;
1192 }
1193
1194 /*
1195  * Compute a DELTA_HASH_ORDER-bit hash code for the first
1196  * NBYTES_HASHED_FOR_DELTA bytes of the sequence beginning at @p when taken in a
1197  * delta context with the specified @span.
1198  */
1199 static inline u32
1200 lzms_delta_hash(const u8 *p, u32 span)
1201 {
1202         /* A delta match has a certain span and an offset that is a multiple of
1203          * that span.  To reduce wasted space we use a single combined hash
1204          * table for all spans and positions, but to minimize collisions we
1205          * include in the hash code computation the span and the low-order bits
1206          * of the current position.  */
1207
1208         BUILD_BUG_ON(NBYTES_HASHED_FOR_DELTA != 3);
1209         u8 d0 = *(p + 0) - *(p + 0 - span);
1210         u8 d1 = *(p + 1) - *(p + 1 - span);
1211         u8 d2 = *(p + 2) - *(p + 2 - span);
1212         u32 v = ((span + ((u32)(uintptr_t)p & (span - 1))) << 24) |
1213                 ((u32)d2 << 16) | ((u32)d1 << 8) | d0;
1214         return lz_hash(v, DELTA_HASH_ORDER);
1215 }
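
/* Layout of the 32-bit value hashed above (illustrative):
 *
 *	bits 24-31: span + (position & (span - 1))
 *	bits 16-23: d2,  bits 8-15: d1,  bits 0-7: d0
 */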
1216
1217 /*
1218  * Given a match between @in_next and @matchptr in a delta context with the
1219  * specified @span and having the initial @len, extend the match as far as
1220  * possible, up to a limit of @max_len.
1221  */
1222 static inline u32
1223 lzms_extend_delta_match(const u8 *in_next, const u8 *matchptr,
1224                         u32 len, u32 max_len, u32 span)
1225 {
1226         while (len < max_len &&
1227                (u8)(*(in_next + len) - *(in_next + len - span)) ==
1228                (u8)(*(matchptr + len) - *(matchptr + len - span)))
1229         {
1230                 len++;
1231         }
1232         return len;
1233 }
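
/* Illustrative: with span == 1, the byte sequences {5,6,7,8} and
 * {25,26,27,28} have identical per-byte deltas (+1 at each position), so they
 * can form a delta match even though the raw bytes differ.  */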
1234
1235 static void
1236 lzms_delta_matchfinder_skip_bytes(struct lzms_compressor *c,
1237                                   const u8 *in_next, u32 count)
1238 {
1239         u32 pos = in_next - c->in_buffer;
1240         if (unlikely(c->in_nbytes - (pos + count) <= NBYTES_HASHED_FOR_DELTA + 1))
1241                 return;
1242         do {
1243                 /* Update the hash table for each power.  */
1244                 for (u32 power = 0; power < NUM_POWERS_TO_CONSIDER; power++) {
1245                         const u32 span = (u32)1 << power;
1246                         if (unlikely(pos < span))
1247                                 continue;
1248                         const u32 next_hash = lzms_delta_hash(in_next + 1, span);
1249                         const u32 hash = c->next_delta_hashes[power];
1250                         c->delta_hash_table[hash] =
1251                                 (power << DELTA_SOURCE_POWER_SHIFT) | pos;
1252                         c->next_delta_hashes[power] = next_hash;
1253                         prefetch(&c->delta_hash_table[next_hash]);
1254                 }
1255         } while (in_next++, pos++, --count);
1256 }
1257
1258 /*
1259  * Skip the next @count bytes (don't search for matches at them).  @in_next
1260  * points to the first byte to skip.  The return value is @in_next + count.
1261  */
1262 static const u8 *
1263 lzms_skip_bytes(struct lzms_compressor *c, u32 count, const u8 *in_next)
1264 {
1265         lcpit_matchfinder_skip_bytes(&c->mf, count);
1266         if (c->use_delta_matches)
1267                 lzms_delta_matchfinder_skip_bytes(c, in_next, count);
1268         return in_next + count;
1269 }
1270
1271 /******************************************************************************
1272  *                          "Near-optimal" parsing                            *
1273  ******************************************************************************/
1274
1275 /*
1276  * The main near-optimal parsing routine.
1277  *
1278  * Briefly, the algorithm does an approximate minimum-cost path search to find a
1279  * "near-optimal" sequence of matches and literals to output, based on the
1280  * current cost model.  The algorithm steps forward, position by position (byte
1281  * by byte), and updates the minimum cost path to reach each later position that
1282  * can be reached using a match or literal from the current position.  This is
1283  * essentially Dijkstra's algorithm in disguise: the graph nodes are positions,
1284  * the graph edges are possible matches/literals to code, and the cost of each
1285  * edge is the estimated number of bits that will be required to output the
1286  * corresponding match or literal.  But one difference is that we actually
1287  * compute the lowest-cost path in pieces, where each piece is terminated when
1288  * there are no choices to be made.
1289  *
1290  * The costs of literals and matches are estimated using the range encoder
1291  * states and the semi-adaptive Huffman codes.  Except for range encoding
1292  * states, costs are assumed to be constant throughout a single run of the
1293  * parsing algorithm, which can parse up to NUM_OPTIM_NODES bytes of data.  This
1294  * introduces a source of non-optimality because the probabilities and Huffman
1295  * codes can change over this part of the data.  And of course, there are
1296  * various other reasons why the result isn't optimal in terms of compression
1297  * ratio.
1298  */
1299 static void
1300 lzms_near_optimal_parse(struct lzms_compressor *c)
1301 {
1302         const u8 *in_next = c->in_buffer;
1303         const u8 * const in_end = &c->in_buffer[c->in_nbytes];
1304         struct lzms_optimum_node *cur_node;
1305         struct lzms_optimum_node *end_node;
1306
1307         /* Set initial length costs for lengths <= MAX_FAST_LENGTH.  */
1308         lzms_update_fast_length_costs(c);
1309
1310         /* Set up the initial adaptive state.  */
1311         lzms_init_adaptive_state(&c->optimum_nodes[0].state);
1312
1313 begin:
1314         /* Start building a new list of items, which will correspond to the next
1315          * piece of the overall minimum-cost path.  */
1316
1317         cur_node = c->optimum_nodes;
1318         cur_node->cost = 0;
1319         end_node = cur_node;
1320
1321         if (in_next == in_end)
1322                 return;
1323
1324         /* The following loop runs once per byte in the input buffer,
1325          * except in a few shortcut cases.  */
1326         for (;;) {
1327                 u32 num_matches;
1328
1329                 /* Repeat offset LZ matches  */
1330                 if (likely(in_next - c->in_buffer >= LZMS_NUM_LZ_REPS &&
1331                            in_end - in_next >= 2))
1332                 {
1333                         for (int rep_idx = 0; rep_idx < LZMS_NUM_LZ_REPS; rep_idx++) {
1334
1335                                 /* Looking for a repeat offset LZ match at queue
1336                                  * index @rep_idx  */
1337
1338                                 const u32 offset = cur_node->state.recent_lz_offsets[rep_idx];
1339                                 const u8 * const matchptr = in_next - offset;
1340
1341                                 /* Check the first 2 bytes before entering the extension loop.  */
1342                                 if (load_u16_unaligned(in_next) != load_u16_unaligned(matchptr))
1343                                         continue;
1344
1345                                 /* Extend the match to its full length.  */
1346                                 const u32 rep_len = lz_extend(in_next, matchptr, 2, in_end - in_next);
1347
1348                                 /* Early out for long repeat offset LZ match */
1349                                 if (rep_len >= c->mf.nice_match_len) {
1350
1351                                         in_next = lzms_skip_bytes(c, rep_len, in_next);
1352
1353                                         lzms_encode_item_list(c, cur_node);
1354                                         lzms_encode_item(c, rep_len, rep_idx);
1355
1356                                         c->optimum_nodes[0].state = cur_node->state;
1357                                         cur_node = &c->optimum_nodes[0];
1358
1359                                         cur_node->state.upcoming_lz_offset =
1360                                                 cur_node->state.recent_lz_offsets[rep_idx];
1361                                         cur_node->state.upcoming_delta_pair = 0;
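                                        /* Shift the used entry out of the
                                         * queue.  Reading index
                                         * LZMS_NUM_LZ_REPS here relies on the
                                         * queue array having one extra slot,
                                         * which exists for the delayed queue
                                         * updates.  (The delta pair queue
                                         * handled below is analogous.)  */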
1362                                         for (int i = rep_idx; i < LZMS_NUM_LZ_REPS; i++)
1363                                                 cur_node->state.recent_lz_offsets[i] =
1364                                                         cur_node->state.recent_lz_offsets[i + 1];
1365                                         lzms_update_lru_queues(&cur_node->state);
1366                                         lzms_update_main_state(&cur_node->state, 1);
1367                                         lzms_update_match_state(&cur_node->state, 0);
1368                                         lzms_update_lz_state(&cur_node->state, 1);
1369                                         lzms_update_lz_rep_states(&cur_node->state, rep_idx);
1370                                         goto begin;
1371                                 }
1372
1373                                 while (end_node < cur_node + rep_len)
1374                                         (++end_node)->cost = INFINITE_COST;
1375
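                                /* Cost of everything in a repeat offset LZ
                                 * match except the length: a 1 bit for
                                 * "match, not literal", a 0 bit for "LZ, not
                                 * delta", a 1 bit for "repeat, not explicit
                                 * offset", plus one decision bit for each
                                 * queue index tried before @rep_idx.  */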
1376                                 u32 base_cost = cur_node->cost +
1377                                                 lzms_bit_1_cost(cur_node->state.main_state,
1378                                                                 c->main_probs) +
1379                                                 lzms_bit_0_cost(cur_node->state.match_state,
1380                                                                 c->match_probs) +
1381                                                 lzms_bit_1_cost(cur_node->state.lz_state,
1382                                                                 c->lz_probs);
1383
1384                                 for (int i = 0; i < rep_idx; i++)
1385                                         base_cost += lzms_bit_1_cost(cur_node->state.lz_rep_states[i],
1386                                                                      c->lz_rep_probs[i]);
1387
1388                                 if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS)
1389                                         base_cost += lzms_bit_0_cost(cur_node->state.lz_rep_states[rep_idx],
1390                                                                      c->lz_rep_probs[rep_idx]);
1391
1392                                 u32 len = 2;
1393                                 do {
1394                                         u32 cost = base_cost + lzms_fast_length_cost(c, len);
1395                                         if (cost < (cur_node + len)->cost) {
1396                                                 (cur_node + len)->cost = cost;
1397                                                 (cur_node + len)->item = (struct lzms_item) {
1398                                                         .length = len,
1399                                                         .source = rep_idx,
1400                                                 };
1401                                                 (cur_node + len)->num_extra_items = 0;
1402                                         }
1403                                 } while (++len <= rep_len);
1404
1405
1406                                 /* try LZ-rep + lit + LZ-rep0  */
1407                                 if (c->try_lzrep_lit_lzrep0 &&
1408                                     in_end - (in_next + rep_len) >= 3 &&
1409                                     load_u16_unaligned(in_next + rep_len + 1) ==
1410                                     load_u16_unaligned(matchptr + rep_len + 1))
1411                                 {
1412                                         const u32 rep0_len = lz_extend(in_next + rep_len + 1,
1413                                                                        matchptr + rep_len + 1,
1414                                                                        2,
1415                                                                        min(c->mf.nice_match_len,
1416                                                                            in_end - (in_next + rep_len + 1)));
1417
1418                                         unsigned main_state = cur_node->state.main_state;
1419                                         unsigned match_state = cur_node->state.match_state;
1420                                         unsigned lz_state = cur_node->state.lz_state;
1421                                         unsigned lz_rep0_state = cur_node->state.lz_rep_states[0];
1422
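                                        /* The following uses local copies of
                                         * the relevant state variables so the
                                         * three-item sequence can be costed
                                         * speculatively without modifying
                                         * cur_node's state.  '(state << 1 |
                                         * bit) % NUM_PROBS' mirrors the
                                         * lzms_update_*_state() transition:
                                         * shift the new bit into the state's
                                         * recent-bit history.  */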
1423                                         /* update states for LZ-rep  */
1424                                         main_state = ((main_state << 1) | 1) % LZMS_NUM_MAIN_PROBS;
1425                                         match_state = ((match_state << 1) | 0) % LZMS_NUM_MATCH_PROBS;
1426                                         lz_state = ((lz_state << 1) | 1) % LZMS_NUM_LZ_PROBS;
1427                                         lz_rep0_state = ((lz_rep0_state << 1) | (rep_idx > 0)) %
1428                                                                 LZMS_NUM_LZ_REP_PROBS;
1429
1430                                         /* LZ-rep cost  */
1431                                         u32 cost = base_cost + lzms_fast_length_cost(c, rep_len);
1432
1433                                         /* add literal cost  */
1434                                         cost += lzms_literal_cost(c, main_state, *(in_next + rep_len));
1435
1436                                         /* update state for literal  */
1437                                         main_state = ((main_state << 1) | 0) % LZMS_NUM_MAIN_PROBS;
1438
1439                                         /* add LZ-rep0 cost  */
1440                                         cost += lzms_bit_1_cost(main_state, c->main_probs) +
1441                                                 lzms_bit_0_cost(match_state, c->match_probs) +
1442                                                 lzms_bit_1_cost(lz_state, c->lz_probs) +
1443                                                 lzms_bit_0_cost(lz_rep0_state, c->lz_rep_probs[0]) +
1444                                                 lzms_fast_length_cost(c, rep0_len);
1445
1446                                         const u32 total_len = rep_len + 1 + rep0_len;
1447
1448                                         while (end_node < cur_node + total_len)
1449                                                 (++end_node)->cost = INFINITE_COST;
1450
1451                                         if (cost < (cur_node + total_len)->cost) {
1452                                                 (cur_node + total_len)->cost = cost;
1453                                                 (cur_node + total_len)->item = (struct lzms_item) {
1454                                                         .length = rep0_len,
1455                                                         .source = 0,
1456                                                 };
1457                                                 (cur_node + total_len)->extra_items[0] = (struct lzms_item) {
1458                                                         .length = 1,
1459                                                         .source = *(in_next + rep_len),
1460                                                 };
1461                                                 (cur_node + total_len)->extra_items[1] = (struct lzms_item) {
1462                                                         .length = rep_len,
1463                                                         .source = rep_idx,
1464                                                 };
1465                                                 (cur_node + total_len)->num_extra_items = 2;
1466                                         }
1467                                 }
1468                         }
1469                 }
1470
1471                 /* Repeat offset delta matches  */
1472                 if (c->use_delta_matches &&
1473                     likely(in_next - c->in_buffer >= LZMS_NUM_DELTA_REPS + 1 &&
1474                            (in_end - in_next >= 2)))
1475                 {
1476                         for (int rep_idx = 0; rep_idx < LZMS_NUM_DELTA_REPS; rep_idx++) {
1477
1478                                 /* Looking for a repeat offset delta match at
1479                                  * queue index @rep_idx  */
1480
1481                                 const u32 pair = cur_node->state.recent_delta_pairs[rep_idx];
1482                                 const u32 power = pair >> DELTA_SOURCE_POWER_SHIFT;
1483                                 const u32 raw_offset = pair & DELTA_SOURCE_RAW_OFFSET_MASK;
1484                                 const u32 span = (u32)1 << power;
1485                                 const u32 offset = raw_offset << power;
1486                                 const u8 * const matchptr = in_next - offset;
1487
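                                /* A delta match with span s matches byte
                                 * differences rather than bytes: it requires
                                 * in[i] - in[i-s] == match[i] - match[i-s]
                                 * (mod 256) at each position.  E.g. with
                                 * power 1 (span 2), the raw bytes may differ
                                 * as long as each differs from the byte 2
                                 * positions back by the same amount in both
                                 * sequences.  */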
1488                                 /* Check the first 2 bytes before entering the
1489                                  * extension loop.  */
1490                                 if (((u8)(*(in_next + 0) - *(in_next + 0 - span)) !=
1491                                      (u8)(*(matchptr + 0) - *(matchptr + 0 - span))) ||
1492                                     ((u8)(*(in_next + 1) - *(in_next + 1 - span)) !=
1493                                      (u8)(*(matchptr + 1) - *(matchptr + 1 - span))))
1494                                         continue;
1495
1496                                 /* Extend the match to its full length.  */
1497                                 const u32 rep_len = lzms_extend_delta_match(in_next, matchptr,
1498                                                                             2, in_end - in_next,
1499                                                                             span);
1500
1501                                 /* Early out for long repeat offset delta match */
1502                                 if (rep_len >= c->mf.nice_match_len) {
1503
1504                                         in_next = lzms_skip_bytes(c, rep_len, in_next);
1505
1506                                         lzms_encode_item_list(c, cur_node);
1507                                         lzms_encode_item(c, rep_len, DELTA_SOURCE_TAG | rep_idx);
1508
1509                                         c->optimum_nodes[0].state = cur_node->state;
1510                                         cur_node = &c->optimum_nodes[0];
1511
1512                                         cur_node->state.upcoming_delta_pair = pair;
1513                                         cur_node->state.upcoming_lz_offset = 0;
1514                                         for (int i = rep_idx; i < LZMS_NUM_DELTA_REPS; i++)
1515                                                 cur_node->state.recent_delta_pairs[i] =
1516                                                         cur_node->state.recent_delta_pairs[i + 1];
1517                                         lzms_update_lru_queues(&cur_node->state);
1518                                         lzms_update_main_state(&cur_node->state, 1);
1519                                         lzms_update_match_state(&cur_node->state, 1);
1520                                         lzms_update_delta_state(&cur_node->state, 1);
1521                                         lzms_update_delta_rep_states(&cur_node->state, rep_idx);
1522                                         goto begin;
1523                                 }
1524
1525                                 while (end_node < cur_node + rep_len)
1526                                         (++end_node)->cost = INFINITE_COST;
1527
1528                                 u32 base_cost = cur_node->cost +
1529                                                 lzms_bit_1_cost(cur_node->state.main_state,
1530                                                                 c->main_probs) +
1531                                                 lzms_bit_1_cost(cur_node->state.match_state,
1532                                                                 c->match_probs) +
1533                                                 lzms_bit_1_cost(cur_node->state.delta_state,
1534                                                                 c->delta_probs);
1535
1536                                 for (int i = 0; i < rep_idx; i++)
1537                                         base_cost += lzms_bit_1_cost(cur_node->state.delta_rep_states[i],
1538                                                                      c->delta_rep_probs[i]);
1539
1540                                 if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS)
1541                                         base_cost += lzms_bit_0_cost(cur_node->state.delta_rep_states[rep_idx],
1542                                                                      c->delta_rep_probs[rep_idx]);
1543
1544                                 u32 len = 2;
1545                                 do {
1546                                         u32 cost = base_cost + lzms_fast_length_cost(c, len);
1547                                         if (cost < (cur_node + len)->cost) {
1548                                                 (cur_node + len)->cost = cost;
1549                                                 (cur_node + len)->item = (struct lzms_item) {
1550                                                         .length = len,
1551                                                         .source = DELTA_SOURCE_TAG | rep_idx,
1552                                                 };
1553                                                 (cur_node + len)->num_extra_items = 0;
1554                                         }
1555                                 } while (++len <= rep_len);
1556                         }
1557                 }
1558
1559                 /* Explicit offset LZ matches  */
1560                 num_matches = lcpit_matchfinder_get_matches(&c->mf, c->matches);
1561                 if (num_matches) {
1562
1563                         u32 best_len = c->matches[0].length;
1564
1565                         /* Early out for long explicit offset LZ match  */
1566                         if (best_len >= c->mf.nice_match_len) {
1567
1568                                 const u32 offset = c->matches[0].offset;
1569
1570                                 /* Extend the match as far as possible.
1571                                  * This is necessary because the LCP-interval
1572                                  * tree matchfinder only reports up to
1573                                  * nice_match_len bytes.  */
1574                                 best_len = lz_extend(in_next, in_next - offset,
1575                                                      best_len, in_end - in_next);
1576
1577                                 in_next = lzms_skip_bytes(c, best_len - 1, in_next + 1);
1578
1579                                 lzms_encode_item_list(c, cur_node);
1580                                 lzms_encode_item(c, best_len, offset + LZMS_NUM_LZ_REPS - 1);
1581
1582                                 c->optimum_nodes[0].state = cur_node->state;
1583                                 cur_node = &c->optimum_nodes[0];
1584
1585                                 cur_node->state.upcoming_lz_offset = offset;
1586                                 cur_node->state.upcoming_delta_pair = 0;
1587                                 lzms_update_lru_queues(&cur_node->state);
1588                                 lzms_update_main_state(&cur_node->state, 1);
1589                                 lzms_update_match_state(&cur_node->state, 0);
1590                                 lzms_update_lz_state(&cur_node->state, 0);
1591                                 goto begin;
1592                         }
1593
1594                         while (end_node < cur_node + best_len)
1595                                 (++end_node)->cost = INFINITE_COST;
1596
1597                         u32 base_cost = cur_node->cost +
1598                                         lzms_bit_1_cost(cur_node->state.main_state,
1599                                                         c->main_probs) +
1600                                         lzms_bit_0_cost(cur_node->state.match_state,
1601                                                         c->match_probs) +
1602                                         lzms_bit_0_cost(cur_node->state.lz_state,
1603                                                         c->lz_probs);
1604
1605                         if (c->try_lzmatch_lit_lzrep0 &&
1606                             likely(in_end - (in_next + c->matches[0].length) >= 3))
1607                         {
1608                                 /* try LZ-match + lit + LZ-rep0  */
1609
1610                                 u32 l = 2;
1611                                 u32 i = num_matches - 1;
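                                /* c->matches[] is ordered longest first, so
                                 * iterating i downwards visits the shortest
                                 * match first and lets @l increase
                                 * monotonically, costing each length with the
                                 * shortest match that covers it.  */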
1612                                 do {
1613                                         const u32 len = c->matches[i].length;
1614                                         const u32 offset = c->matches[i].offset;
1615                                         const u32 position_cost = base_cost +
1616                                                                   lzms_lz_offset_cost(c, offset);
1617                                         do {
1618                                                 u32 cost = position_cost + lzms_fast_length_cost(c, l);
1619                                                 if (cost < (cur_node + l)->cost) {
1620                                                         (cur_node + l)->cost = cost;
1621                                                         (cur_node + l)->item = (struct lzms_item) {
1622                                                                 .length = l,
1623                                                                 .source = offset + (LZMS_NUM_LZ_REPS - 1),
1624                                                         };
1625                                                         (cur_node + l)->num_extra_items = 0;
1626                                                 }
1627                                         } while (++l <= len);
1628
1629                                         const u8 * const matchptr = in_next - offset;
1630                                         if (load_u16_unaligned(matchptr + len + 1) !=
1631                                             load_u16_unaligned(in_next + len + 1))
1632                                                 continue;
1633
1634                                         const u32 rep0_len = lz_extend(in_next + len + 1,
1635                                                                        matchptr + len + 1,
1636                                                                        2,
1637                                                                        min(c->mf.nice_match_len,
1638                                                                            in_end - (in_next + len + 1)));
1639
1640                                         unsigned main_state = cur_node->state.main_state;
1641                                         unsigned match_state = cur_node->state.match_state;
1642                                         unsigned lz_state = cur_node->state.lz_state;
1643
1644                                         /* update states for LZ-match  */
1645                                         main_state = ((main_state << 1) | 1) % LZMS_NUM_MAIN_PROBS;
1646                                         match_state = ((match_state << 1) | 0) % LZMS_NUM_MATCH_PROBS;
1647                                         lz_state = ((lz_state << 1) | 0) % LZMS_NUM_LZ_PROBS;
1648
1649                                         /* LZ-match cost  */
1650                                         u32 cost = position_cost + lzms_fast_length_cost(c, len);
1651
1652                                         /* add literal cost  */
1653                                         cost += lzms_literal_cost(c, main_state, *(in_next + len));
1654
1655                                         /* update state for literal  */
1656                                         main_state = ((main_state << 1) | 0) % LZMS_NUM_MAIN_PROBS;
1657
1658                                         /* add LZ-rep0 cost  */
1659                                         cost += lzms_bit_1_cost(main_state, c->main_probs) +
1660                                                 lzms_bit_0_cost(match_state, c->match_probs) +
1661                                                 lzms_bit_1_cost(lz_state, c->lz_probs) +
1662                                                 lzms_bit_0_cost(cur_node->state.lz_rep_states[0],
1663                                                                 c->lz_rep_probs[0]) +
1664                                                 lzms_fast_length_cost(c, rep0_len);
1665
1666                                         const u32 total_len = len + 1 + rep0_len;
1667
1668                                         while (end_node < cur_node + total_len)
1669                                                 (++end_node)->cost = INFINITE_COST;
1670
1671                                         if (cost < (cur_node + total_len)->cost) {
1672                                                 (cur_node + total_len)->cost = cost;
1673                                                 (cur_node + total_len)->item = (struct lzms_item) {
1674                                                         .length = rep0_len,
1675                                                         .source = 0,
1676                                                 };
1677                                                 (cur_node + total_len)->extra_items[0] = (struct lzms_item) {
1678                                                         .length = 1,
1679                                                         .source = *(in_next + len),
1680                                                 };
1681                                                 (cur_node + total_len)->extra_items[1] = (struct lzms_item) {
1682                                                         .length = len,
1683                                                         .source = offset + LZMS_NUM_LZ_REPS - 1,
1684                                                 };
1685                                                 (cur_node + total_len)->num_extra_items = 2;
1686                                         }
1687                                 } while (i--);
1688                         } else {
1689                                 u32 l = 2;
1690                                 u32 i = num_matches - 1;
1691                                 do {
1692                                         u32 position_cost = base_cost +
1693                                                             lzms_lz_offset_cost(c, c->matches[i].offset);
1694                                         do {
1695                                                 u32 cost = position_cost + lzms_fast_length_cost(c, l);
1696                                                 if (cost < (cur_node + l)->cost) {
1697                                                         (cur_node + l)->cost = cost;
1698                                                         (cur_node + l)->item = (struct lzms_item) {
1699                                                                 .length = l,
1700                                                                 .source = c->matches[i].offset +
1701                                                                           (LZMS_NUM_LZ_REPS - 1),
1702                                                         };
1703                                                         (cur_node + l)->num_extra_items = 0;
1704                                                 }
1705                                         } while (++l <= c->matches[i].length);
1706                                 } while (i--);
1707                         }
1708                 }
1709
1710                 /* Explicit offset delta matches  */
1711                 if (c->use_delta_matches &&
1712                     likely(in_end - in_next >= NBYTES_HASHED_FOR_DELTA + 1))
1713                 {
1714                         const u32 pos = in_next - c->in_buffer;
1715
1716                         /* Consider each possible power (log2 of span)  */
1717                         BUILD_BUG_ON(NUM_POWERS_TO_CONSIDER > LZMS_NUM_DELTA_POWER_SYMS);
1718                         for (u32 power = 0; power < NUM_POWERS_TO_CONSIDER; power++) {
1719
1720                                 const u32 span = (u32)1 << power;
1721
1722                                 if (unlikely(pos < span))
1723                                         continue;
1724
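                                /* The hash for the current position was
                                 * computed and cached at the previous
                                 * position ('next_delta_hashes'), so only the
                                 * hash for the next position is computed
                                 * here; its table slot is prefetched for the
                                 * next iteration.  */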
1725                                 const u32 next_hash = lzms_delta_hash(in_next + 1, span);
1726                                 const u32 hash = c->next_delta_hashes[power];
1727                                 const u32 cur_match = c->delta_hash_table[hash];
1728
1729                                 c->delta_hash_table[hash] = (power << DELTA_SOURCE_POWER_SHIFT) | pos;
1730                                 c->next_delta_hashes[power] = next_hash;
1731                                 prefetch(&c->delta_hash_table[next_hash]);
1732
1733                                 if (power != cur_match >> DELTA_SOURCE_POWER_SHIFT)
1734                                         continue;
1735
1736                                 const u32 offset = pos - (cur_match & DELTA_SOURCE_RAW_OFFSET_MASK);
1737
1738                                 /* The offset must be a multiple of span.  */
1739                                 if (offset & (span - 1))
1740                                         continue;
1741
1742                                 const u8 * const matchptr = in_next - offset;
1743
1744                                 /* Check the first 3 bytes before entering the
1745                                  * extension loop.  */
1746                                 BUILD_BUG_ON(NBYTES_HASHED_FOR_DELTA != 3);
1747                                 if (((u8)(*(in_next + 0) - *(in_next + 0 - span)) !=
1748                                      (u8)(*(matchptr + 0) - *(matchptr + 0 - span))) ||
1749                                     ((u8)(*(in_next + 1) - *(in_next + 1 - span)) !=
1750                                      (u8)(*(matchptr + 1) - *(matchptr + 1 - span))) ||
1751                                     ((u8)(*(in_next + 2) - *(in_next + 2 - span)) !=
1752                                      (u8)(*(matchptr + 2) - *(matchptr + 2 - span))))
1753                                         continue;
1754
1755                                 /* Extend the delta match to its full length.  */
1756                                 const u32 len = lzms_extend_delta_match(in_next,
1757                                                                         matchptr,
1758                                                                         3,
1759                                                                         in_end - in_next,
1760                                                                         span);
1761
1762                                 const u32 raw_offset = offset >> power;
1763                                 const u32 pair = (power << DELTA_SOURCE_POWER_SHIFT) |
1764                                                  raw_offset;
1765                                 const u32 source = DELTA_SOURCE_TAG |
1766                                                    (pair + LZMS_NUM_DELTA_REPS - 1);
1767
1768                                 /* Early out for long explicit offset delta match  */
1769                                 if (len >= c->mf.nice_match_len) {
1770
1771                                         in_next = lzms_skip_bytes(c, len - 1, in_next + 1);
1772
1773                                         lzms_encode_item_list(c, cur_node);
1774                                         lzms_encode_item(c, len, source);
1775
1776                                         c->optimum_nodes[0].state = cur_node->state;
1777                                         cur_node = &c->optimum_nodes[0];
1778
1779                                         cur_node->state.upcoming_lz_offset = 0;
1780                                         cur_node->state.upcoming_delta_pair = pair;
1781                                         lzms_update_lru_queues(&cur_node->state);
1782                                         lzms_update_main_state(&cur_node->state, 1);
1783                                         lzms_update_match_state(&cur_node->state, 1);
1784                                         lzms_update_delta_state(&cur_node->state, 0);
1785                                         goto begin;
1786                                 }
1787
1788                                 while (end_node < cur_node + len)
1789                                         (++end_node)->cost = INFINITE_COST;
1790
1791                                 u32 base_cost = cur_node->cost +
1792                                                 lzms_bit_1_cost(cur_node->state.main_state,
1793                                                                 c->main_probs) +
1794                                                 lzms_bit_1_cost(cur_node->state.match_state,
1795                                                                 c->match_probs) +
1796                                                 lzms_bit_0_cost(cur_node->state.delta_state,
1797                                                                 c->delta_probs) +
1798                                                 lzms_delta_source_cost(c, power, raw_offset);
1799
1800                                 u32 l = NBYTES_HASHED_FOR_DELTA;
1801                                 do {
1802                                         u32 cost = base_cost + lzms_fast_length_cost(c, l);
1803                                         if (cost < (cur_node + l)->cost) {
1804                                                 (cur_node + l)->cost = cost;
1805                                                 (cur_node + l)->item = (struct lzms_item) {
1806                                                         .length = l,
1807                                                         .source = source,
1808                                                 };
1809                                                 (cur_node + l)->num_extra_items = 0;
1810                                         }
1811                                 } while (++l <= len);
1812                         }
1813                 }
1814
1815                 /* Literal  */
1816                 if (end_node < cur_node + 1)
1817                         (++end_node)->cost = INFINITE_COST;
1818                 const u32 cur_and_lit_cost = cur_node->cost +
1819                                              lzms_literal_cost(c, cur_node->state.main_state,
1820                                                                *in_next);
1821                 if (cur_and_lit_cost < (cur_node + 1)->cost) {
1822                         (cur_node + 1)->cost = cur_and_lit_cost;
1823                         (cur_node + 1)->item = (struct lzms_item) {
1824                                 .length = 1,
1825                                 .source = *in_next,
1826                         };
1827                         (cur_node + 1)->num_extra_items = 0;
1828                 } else if (c->try_lit_lzrep0 && in_end - (in_next + 1) >= 2) {
1829                         /* try lit + LZ-rep0  */
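                        /* LZMS applies offset queue updates with a one-item
                         * delay, so the offset that rep0 will refer to after
                         * this literal is the still-pending prev_lz_offset if
                         * one exists, else the current head of the queue.  */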
1830                         const u32 offset =
1831                                 (cur_node->state.prev_lz_offset) ?
1832                                         cur_node->state.prev_lz_offset :
1833                                         cur_node->state.recent_lz_offsets[0];
1834
1835                         if (load_u16_unaligned(in_next + 1) ==
1836                             load_u16_unaligned(in_next + 1 - offset))
1837                         {
1838                                 const u32 rep0_len = lz_extend(in_next + 1,
1839                                                                in_next + 1 - offset,
1840                                                                2,
1841                                                                min(in_end - (in_next + 1),
1842                                                                    c->mf.nice_match_len));
1843
1844                                 unsigned main_state = cur_node->state.main_state;
1845
1846                                 /* Update main_state after literal  */
1847                                 main_state = (main_state << 1 | 0) % LZMS_NUM_MAIN_PROBS;
1848
1849                                 /* Add cost of LZ-rep0  */
1850                                 const u32 cost = cur_and_lit_cost +
1851                                                  lzms_bit_1_cost(main_state, c->main_probs) +
1852                                                  lzms_bit_0_cost(cur_node->state.match_state,
1853                                                                  c->match_probs) +
1854                                                  lzms_bit_1_cost(cur_node->state.lz_state,
1855                                                                  c->lz_probs) +
1856                                                  lzms_bit_0_cost(cur_node->state.lz_rep_states[0],
1857                                                                  c->lz_rep_probs[0]) +
1858                                                  lzms_fast_length_cost(c, rep0_len);
1859
1860                                 const u32 total_len = 1 + rep0_len;
1861
1862                                 while (end_node < cur_node + total_len)
1863                                         (++end_node)->cost = INFINITE_COST;
1864
1865                                 if (cost < (cur_node + total_len)->cost) {
1866                                         (cur_node + total_len)->cost = cost;
1867                                         (cur_node + total_len)->item = (struct lzms_item) {
1868                                                 .length = rep0_len,
1869                                                 .source = 0,
1870                                         };
1871                                         (cur_node + total_len)->extra_items[0] = (struct lzms_item) {
1872                                                 .length = 1,
1873                                                 .source = *in_next,
1874                                         };
1875                                         (cur_node + total_len)->num_extra_items = 1;
1876                                 }
1877                         }
1878                 }
1879
1880                 /* Advance to the next position.  */
1881                 in_next++;
1882                 cur_node++;
1883
1884                 /* The lowest-cost path to the current position is now known.
1885                  * Finalize the adaptive state that results from taking this
1886                  * lowest-cost path.  */
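                /* Note on storage order: 'item' is the chronologically last
                 * item of the edge taken to reach this node, and
                 * 'extra_items[]' holds any earlier items, newest first.  So
                 * first walk back to the node this edge started from, then
                 * replay the items oldest-first, applying each one's adaptive
                 * state updates in encoding order.  */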
1887                 struct lzms_item item_to_take = cur_node->item;
1888                 struct lzms_optimum_node *source_node = cur_node - (item_to_take.length);
1889                 int next_item_idx = -1;
1890                 for (unsigned i = 0; i < cur_node->num_extra_items; i++) {
1891                         item_to_take = cur_node->extra_items[i];
1892                         source_node -= item_to_take.length;
1893                         next_item_idx++;
1894                 }
1895                 cur_node->state = source_node->state;
1896                 for (;;) {
1897                         const u32 length = item_to_take.length;
1898                         u32 source = item_to_take.source;
1899
1900                         cur_node->state.upcoming_lz_offset = 0;
1901                         cur_node->state.upcoming_delta_pair = 0;
1902                         if (length > 1) {
1903                                 /* Match  */
1904
1905                                 lzms_update_main_state(&cur_node->state, 1);
1906
1907                                 if (source & DELTA_SOURCE_TAG) {
1908                                         /* Delta match  */
1909
1910                                         lzms_update_match_state(&cur_node->state, 1);
1911                                         source &= ~DELTA_SOURCE_TAG;
1912
1913                                         if (source >= LZMS_NUM_DELTA_REPS) {
1914                                                 /* Explicit offset delta match  */
1915                                                 u32 pair = source - (LZMS_NUM_DELTA_REPS - 1);
1916                                                 lzms_update_delta_state(&cur_node->state, 0);
1917                                                 cur_node->state.upcoming_delta_pair = pair;
1918                                         } else {
1919                                                 /* Repeat offset delta match  */
1920                                                 int rep_idx = source;
1921
1922                                                 lzms_update_delta_state(&cur_node->state, 1);
1923                                                 lzms_update_delta_rep_states(&cur_node->state, rep_idx);
1924
1925                                                 cur_node->state.upcoming_delta_pair =
1926                                                         cur_node->state.recent_delta_pairs[rep_idx];
1927
1928                                                 for (int i = rep_idx; i < LZMS_NUM_DELTA_REPS; i++)
1929                                                         cur_node->state.recent_delta_pairs[i] =
1930                                                                 cur_node->state.recent_delta_pairs[i + 1];
1931                                         }
1932                                 } else {
1933                                         lzms_update_match_state(&cur_node->state, 0);
1934
1935                                         if (source >= LZMS_NUM_LZ_REPS) {
1936                                                 /* Explicit offset LZ match  */
1937                                                 lzms_update_lz_state(&cur_node->state, 0);
1938                                                 cur_node->state.upcoming_lz_offset =
1939                                                         source - (LZMS_NUM_LZ_REPS - 1);
1940                                         } else {
1941                                                 /* Repeat offset LZ match  */
1942                                                 int rep_idx = source;
1943
1944                                                 lzms_update_lz_state(&cur_node->state, 1);
1945                                                 lzms_update_lz_rep_states(&cur_node->state, rep_idx);
1946
1947                                                 cur_node->state.upcoming_lz_offset =
1948                                                         cur_node->state.recent_lz_offsets[rep_idx];
1949
1950                                                 for (int i = rep_idx; i < LZMS_NUM_LZ_REPS; i++)
1951                                                         cur_node->state.recent_lz_offsets[i] =
1952                                                                 cur_node->state.recent_lz_offsets[i + 1];
1953                                         }
1954                                 }
1955                         } else {
1956                                 /* Literal  */
1957                                 lzms_update_main_state(&cur_node->state, 0);
1958                         }
1959
1960                         lzms_update_lru_queues(&cur_node->state);
1961
1962                         if (next_item_idx < 0)
1963                                 break;
1964                         if (next_item_idx == 0)
1965                                 item_to_take = cur_node->item;
1966                         else
1967                                 item_to_take = cur_node->extra_items[next_item_idx - 1];
1968                         --next_item_idx;
1969                 }
1970
1971                 /*
1972                  * This loop will terminate when either of the following
1973                  * conditions is true:
1974                  *
1975                  * (1) cur_node == end_node
1976                  *
1977                  *      There are no paths that extend beyond the current
1978                  *      position.  In this case, any path to a later position
1979                  *      must pass through the current position, so we can go
1980                  *      ahead and choose the list of items that led to this
1981                  *      position.
1982                  *
1983                  * (2) cur_node == &c->optimum_nodes[NUM_OPTIM_NODES]
1984                  *
1985                  *      This bounds the number of times the algorithm can step
1986                  *      forward before it is guaranteed to start choosing items.
1987                  *      This limits the memory usage.  It also guarantees that
1988                  *      the parser will not go too long without updating the
1989                  *      probability tables.
1990                  *
1991                  * Note: no check for end-of-buffer is needed because
1992                  * end-of-buffer will trigger condition (1).
1993                  */
1994                 if (cur_node == end_node ||
1995                     cur_node == &c->optimum_nodes[NUM_OPTIM_NODES])
1996                 {
1997                         lzms_encode_nonempty_item_list(c, cur_node);
1998                         c->optimum_nodes[0].state = cur_node->state;
1999                         goto begin;
2000                 }
2001         }
2002 }
2003
2004 static void
2005 lzms_init_states_and_probabilities(struct lzms_compressor *c)
2006 {
2007         c->main_state = 0;
2008         c->match_state = 0;
2009         c->lz_state = 0;
2010         for (int i = 0; i < LZMS_NUM_LZ_REP_DECISIONS; i++)
2011                 c->lz_rep_states[i] = 0;
2012         c->delta_state = 0;
2013         for (int i = 0; i < LZMS_NUM_DELTA_REP_DECISIONS; i++)
2014                 c->delta_rep_states[i] = 0;
2015
2016         lzms_init_probability_entries(c->main_probs, LZMS_NUM_MAIN_PROBS);
2017         lzms_init_probability_entries(c->match_probs, LZMS_NUM_MATCH_PROBS);
2018         lzms_init_probability_entries(c->lz_probs, LZMS_NUM_LZ_PROBS);
2019         for (int i = 0; i < LZMS_NUM_LZ_REP_DECISIONS; i++)
2020                 lzms_init_probability_entries(c->lz_rep_probs[i], LZMS_NUM_LZ_REP_PROBS);
2021         lzms_init_probability_entries(c->delta_probs, LZMS_NUM_DELTA_PROBS);
2022         for (int i = 0; i < LZMS_NUM_DELTA_REP_DECISIONS; i++)
2023                 lzms_init_probability_entries(c->delta_rep_probs[i], LZMS_NUM_DELTA_REP_PROBS);
2024 }
2025
2026 static void
2027 lzms_init_huffman_codes(struct lzms_compressor *c, unsigned num_offset_slots)
2028 {
2029         lzms_init_huffman_code(&c->literal_rebuild_info,
2030                                LZMS_NUM_LITERAL_SYMS,
2031                                LZMS_LITERAL_CODE_REBUILD_FREQ,
2032                                c->literal_codewords,
2033                                c->literal_lens,
2034                                c->literal_freqs);
2035
2036         lzms_init_huffman_code(&c->lz_offset_rebuild_info,
2037                                num_offset_slots,
2038                                LZMS_LZ_OFFSET_CODE_REBUILD_FREQ,
2039                                c->lz_offset_codewords,
2040                                c->lz_offset_lens,
2041                                c->lz_offset_freqs);
2042
2043         lzms_init_huffman_code(&c->length_rebuild_info,
2044                                LZMS_NUM_LENGTH_SYMS,
2045                                LZMS_LENGTH_CODE_REBUILD_FREQ,
2046                                c->length_codewords,
2047                                c->length_lens,
2048                                c->length_freqs);
2049
2050         lzms_init_huffman_code(&c->delta_offset_rebuild_info,
2051                                num_offset_slots,
2052                                LZMS_DELTA_OFFSET_CODE_REBUILD_FREQ,
2053                                c->delta_offset_codewords,
2054                                c->delta_offset_lens,
2055                                c->delta_offset_freqs);
2056
2057         lzms_init_huffman_code(&c->delta_power_rebuild_info,
2058                                LZMS_NUM_DELTA_POWER_SYMS,
2059                                LZMS_DELTA_POWER_CODE_REBUILD_FREQ,
2060                                c->delta_power_codewords,
2061                                c->delta_power_lens,
2062                                c->delta_power_freqs);
2063 }
2064
2065 /*
2066  * Flush the output streams, prepare the final compressed data, and return its
2067  * size in bytes.
2068  *
2069  * A return value of 0 indicates that the data could not be compressed to fit in
2070  * the available space.
2071  */
2072 static size_t
2073 lzms_finalize(struct lzms_compressor *c)
2074 {
2075         size_t num_forwards_units;
2076         size_t num_backwards_units;
2077
2078         /* Flush both the forwards and backwards streams, and make sure they
2079          * didn't cross each other and start overwriting each other's data.  */
2080         if (!lzms_output_bitstream_flush(&c->os))
2081                 return 0;
2082
2083         if (!lzms_range_encoder_flush(&c->rc))
2084                 return 0;
2085
2086         if (c->rc.next > c->os.next)
2087                 return 0;
2088
2089         /* Now the compressed buffer contains the data output by the forwards
2090          * bitstream, then empty space, then data output by the backwards
2091          * bitstream.  Move the data output by the backwards bitstream to be
2092  * adjacent to the data output by the forwards bitstream, and calculate
2093          * the compressed size that this results in.  */
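        /* Rough picture, where F is a forwards unit (range encoder), B is a
         * backwards unit (bitstream), and . is unused space:
         *
         *      [FFFF........BBB]  ==>  [FFFFBBB]
         */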
2094         num_forwards_units = c->rc.next - c->rc.begin;
2095         num_backwards_units = c->rc.end - c->os.next;
2096
2097         memmove(c->rc.next, c->os.next, num_backwards_units * sizeof(le16));
2098
2099         return (num_forwards_units + num_backwards_units) * sizeof(le16);
2100 }
2101
2102 static u64
2103 lzms_get_needed_memory(size_t max_bufsize, unsigned compression_level)
2104 {
2105         u64 size = 0;
2106
2107         if (max_bufsize > LZMS_MAX_BUFFER_SIZE)
2108                 return 0;
2109
2110         size += sizeof(struct lzms_compressor);
2111
2112         /* in_buffer */
2113         size += max_bufsize;
2114
2115         /* mf */
2116         size += lcpit_matchfinder_get_needed_memory(max_bufsize);
2117
2118         return size;
2119 }
2120
2121 static int
2122 lzms_create_compressor(size_t max_bufsize, unsigned compression_level,
2123                        void **c_ret)
2124 {
2125         struct lzms_compressor *c;
2126         u32 nice_match_len;
2127
2128         if (max_bufsize > LZMS_MAX_BUFFER_SIZE)
2129                 return WIMLIB_ERR_INVALID_PARAM;
2130
2131         c = ALIGNED_MALLOC(sizeof(struct lzms_compressor), 64);
2132         if (!c)
2133                 goto oom0;
2134
2135         /* Scale nice_match_len with the compression level.  But to allow an
2136          * optimization for length cost calculations, don't allow nice_match_len
2137          * to exceed MAX_FAST_LENGTH.  */
2138         nice_match_len = min(((u64)compression_level * 63) / 50, MAX_FAST_LENGTH);
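        /* For example, level 50 (the usual default) gives nice_match_len 63,
         * while levels of about 203 and above saturate at MAX_FAST_LENGTH.  */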
2139
2140         c->use_delta_matches = (compression_level >= 35);
2141         c->try_lzmatch_lit_lzrep0 = (compression_level >= 45);
2142         c->try_lit_lzrep0 = (compression_level >= 60);
2143         c->try_lzrep_lit_lzrep0 = (compression_level >= 60);
2144
2145         c->in_buffer = MALLOC(max_bufsize);
2146         if (!c->in_buffer)
2147                 goto oom1;
2148
2149         if (!lcpit_matchfinder_init(&c->mf, max_bufsize, 2, nice_match_len))
2150                 goto oom2;
2151
2152         lzms_init_fast_length_slot_tab(c);
2153         lzms_init_offset_slot_tabs(c);
2154
2155         *c_ret = c;
2156         return 0;
2157
2158 oom2:
2159         FREE(c->in_buffer);
2160 oom1:
2161         ALIGNED_FREE(c);
2162 oom0:
2163         return WIMLIB_ERR_NOMEM;
2164 }
2165
2166 static size_t
2167 lzms_compress(const void *in, size_t in_nbytes,
2168               void *out, size_t out_nbytes_avail, void *_c)
2169 {
2170         struct lzms_compressor *c = _c;
2171
2172         /* Don't bother trying to compress extremely small inputs.  */
2173         if (in_nbytes < 4)
2174                 return 0;
2175
2176         /* Copy the input data into the internal buffer and preprocess it.  */
2177         memcpy(c->in_buffer, in, in_nbytes);
2178         c->in_nbytes = in_nbytes;
2179         lzms_x86_filter(c->in_buffer, in_nbytes, c->last_target_usages, false);
2180
2181         /* Prepare the matchfinders.  */
2182         lcpit_matchfinder_load_buffer(&c->mf, c->in_buffer, c->in_nbytes);
2183         if (c->use_delta_matches)
2184                 lzms_init_delta_matchfinder(c);
2185
2186         /* Initialize the encoder structures.  */
2187         lzms_range_encoder_init(&c->rc, out, out_nbytes_avail / sizeof(le16));
2188         lzms_output_bitstream_init(&c->os, out, out_nbytes_avail / sizeof(le16));
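        /* Note that the range encoder and the output bitstream share the
         * output buffer: the range encoder writes 16-bit units forwards from
         * the beginning, while the bitstream writes them backwards from the
         * end.  lzms_finalize() checks that the two streams didn't collide.  */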
2189         lzms_init_states_and_probabilities(c);
2190         lzms_init_huffman_codes(c, lzms_get_num_offset_slots(in_nbytes));
2191
2192         /* The main loop: parse and encode.  */
2193         lzms_near_optimal_parse(c);
2194
2195         /* Return the compressed data size or 0.  */
2196         return lzms_finalize(c);
2197 }
2198
2199 static void
2200 lzms_free_compressor(void *_c)
2201 {
2202         struct lzms_compressor *c = _c;
2203
2204         FREE(c->in_buffer);
2205         lcpit_matchfinder_destroy(&c->mf);
2206         ALIGNED_FREE(c);
2207 }
2208
2209 const struct compressor_ops lzms_compressor_ops = {
2210         .get_needed_memory  = lzms_get_needed_memory,
2211         .create_compressor  = lzms_create_compressor,
2212         .compress           = lzms_compress,
2213         .free_compressor    = lzms_free_compressor,
2214 };