1 /*
2  * lzx_compress.c
3  *
4  * A compressor for the LZX compression format, as used in WIM archives.
5  */
6
7 /*
8  * Copyright (C) 2012-2016 Eric Biggers
9  *
10  * This file is free software; you can redistribute it and/or modify it under
11  * the terms of the GNU Lesser General Public License as published by the Free
12  * Software Foundation; either version 3 of the License, or (at your option) any
13  * later version.
14  *
15  * This file is distributed in the hope that it will be useful, but WITHOUT
16  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17  * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
18  * details.
19  *
20  * You should have received a copy of the GNU Lesser General Public License
21  * along with this file; if not, see http://www.gnu.org/licenses/.
22  */
23
24
25 /*
26  * This file contains a compressor for the LZX ("Lempel-Ziv eXtended")
27  * compression format, as used in the WIM (Windows IMaging) file format.
28  *
29  * Two different LZX-compatible algorithms are implemented: "near-optimal" and
30  * "lazy".  "Near-optimal" is significantly slower than "lazy", but results in a
31  * better compression ratio.  The "near-optimal" algorithm is used at the
32  * default compression level.
33  *
34  * This file may need some slight modifications to be used outside of the WIM
35  * format.  In particular, in other situations the LZX block header might be
36  * slightly different, and sliding window support might be required.
37  *
38  * LZX is a compression format closely resembling DEFLATE, the format used by
39  * zlib and gzip.  Both LZX and DEFLATE use LZ77 matching and Huffman coding.
40  * Certain details are quite similar, such as the method for storing Huffman
41  * codes.  However, the main differences are:
42  *
43  * - LZX preprocesses the data to attempt to make x86 machine code slightly
44  *   more compressible before the main compression step.
45  *
46  * - LZX uses a "main" alphabet which combines literals and matches, with the
47  *   match symbols containing a "length header" (giving all or part of the match
48  *   length) and an "offset slot" (giving, roughly speaking, the order of
49  *   magnitude of the match offset).
50  *
51  * - LZX does not have static Huffman blocks (that is, the kind with preset
52  *   Huffman codes); however it does have two types of dynamic Huffman blocks
53  *   ("verbatim" and "aligned").
54  *
55  * - LZX has a minimum match length of 2 rather than 3.  Length 2 matches can be
56  *   useful, but generally only if the compressor is smart about choosing them.
57  *
58  * - In LZX, offset slots 0 through 2 actually represent entries in an LRU queue
59  *   of match offsets.  This is very useful for certain types of files, such as
60  *   binary files that have repeating records.
61  */
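
/*
 * Worked illustration of the main alphabet: symbols 0 through 255 are the
 * literal bytes, and each higher symbol encodes an (offset slot, length
 * header) pair.  For example, the first match symbol, 256, denotes offset
 * slot 0 (the most recently used offset in the LRU queue) together with
 * length header 0, i.e. a match of the minimum length 2.
 */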
62
63 /******************************************************************************/
64 /*                            General parameters                              */
65 /*----------------------------------------------------------------------------*/
66
67 /*
68  * The compressor uses the faster algorithm at levels <= MAX_FAST_LEVEL.  It
69  * uses the slower algorithm at levels > MAX_FAST_LEVEL.
70  */
71 #define MAX_FAST_LEVEL                          34
72
73 /*
74  * The compressor-side limits on the codeword lengths (in bits) for each Huffman
75  * code.  To make outputting bits slightly faster, some of these limits are
76  * lower than the limits defined by the LZX format.  This does not significantly
77  * affect the compression ratio.
78  */
79 #define MAIN_CODEWORD_LIMIT                     16
80 #define LENGTH_CODEWORD_LIMIT                   12
81 #define ALIGNED_CODEWORD_LIMIT                  7
82 #define PRE_CODEWORD_LIMIT                      7
83
84
85 /******************************************************************************/
86 /*                         Block splitting parameters                         */
87 /*----------------------------------------------------------------------------*/
88
89 /*
90  * The compressor always outputs blocks of at least this size in bytes, except
91  * for the last block which may need to be smaller.
92  */
93 #define MIN_BLOCK_SIZE                          6500
94
95 /*
96  * The compressor attempts to end a block when it reaches this size in bytes.
97  * The final size might be slightly larger due to matches extending beyond the
98  * end of the block.  Specifically:
99  *
100  *  - The near-optimal compressor may choose a match of up to LZX_MAX_MATCH_LEN
101  *    bytes starting at the SOFT_MAX_BLOCK_SIZE'th byte.
102  *
103  *  - The lazy compressor may choose a sequence of literals starting at the
104  *    SOFT_MAX_BLOCK_SIZE'th byte when it sees a sequence of increasingly better
105  *    matches.  The final match may be up to LZX_MAX_MATCH_LEN bytes.  The
106  *    length of the literal sequence is approximately limited by the "nice match
107  *    length" parameter.
108  */
109 #define SOFT_MAX_BLOCK_SIZE                     100000
110
111 /*
112  * The number of observed items (matches and literals) that represents
113  * sufficient data for the compressor to decide whether the current block should
114  * be ended or not.
115  */
116 #define NUM_OBSERVATIONS_PER_BLOCK_CHECK        400
117
118
119 /******************************************************************************/
120 /*                      Parameters for slower algorithm                       */
121 /*----------------------------------------------------------------------------*/
122
123 /*
124  * The log base 2 of the number of entries in the hash table for finding length
125  * 2 matches.  This could be as high as 16, but using a smaller hash table
126  * speeds up compression due to reduced cache pressure.
127  */
128 #define BT_MATCHFINDER_HASH2_ORDER              12
129
130 /*
131  * The number of lz_match structures in the match cache, excluding the extra
132  * "overflow" entries.  This value should be high enough so that, nearly all
133  * the time, all matches found in a given block can fit in the match cache.
134  * However, fallback behavior (immediately terminating the block) on cache
135  * overflow is still required.
136  */
137 #define CACHE_LENGTH                            (SOFT_MAX_BLOCK_SIZE * 5)
138
139 /*
140  * An upper bound on the number of matches that can ever be saved in the match
141  * cache for a single position.  Since each match we save for a single position
142  * has a distinct length, we can use the number of possible match lengths in LZX
143  * as this bound.  This bound is guaranteed to be valid in all cases, although
144  * if 'nice_match_length < LZX_MAX_MATCH_LEN', then it will never actually be
145  * reached.
146  */
147 #define MAX_MATCHES_PER_POS                     LZX_NUM_LENS
148
149 /*
150  * A scaling factor that makes it possible to consider fractional bit costs.  A
151  * single bit has a cost of (1 << COST_SHIFT).
152  *
153  * Note: this is only useful as a statistical trick for when the true costs are
154  * unknown.  In reality, each token in LZX requires a whole number of bits to
155  * output.
156  */
157 #define COST_SHIFT                              6
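
/*
 * Worked example: with COST_SHIFT == 6, one bit costs 1 << 6 == 64 units, so
 * a symbol whose estimated cost is 5.25 bits is stored as 5.25 * 64 == 336
 * units.
 */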
158
159 /*
160  * Should the compressor take into account the costs of aligned offset symbols?
161  */
162 #define CONSIDER_ALIGNED_COSTS                  1
163
164 /*
165  * Should the "minimum" cost path search algorithm consider "gap" matches, where
166  * a normal match is followed by a literal, then by a match with the same
167  * offset?  This is one specific, somewhat common situation in which the true
168  * minimum cost path is often different from the path found by looking only one
169  * edge ahead.
170  */
171 #define CONSIDER_GAP_MATCHES                    1
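
/*
 * Illustrative example (hypothetical data): in "abcXdef ... abcYdef", the
 * second occurrence is cheapest as the match "abc", the lone literal 'Y', and
 * then the match "def" reusing the same offset, a path which a search that
 * looks only one edge ahead can miss.
 */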
172
173 /******************************************************************************/
174 /*                                  Includes                                  */
175 /*----------------------------------------------------------------------------*/
176
177 #ifdef HAVE_CONFIG_H
178 #  include "config.h"
179 #endif
180
181 #include "wimlib/compress_common.h"
182 #include "wimlib/compressor_ops.h"
183 #include "wimlib/error.h"
184 #include "wimlib/lz_extend.h"
185 #include "wimlib/lzx_common.h"
186 #include "wimlib/unaligned.h"
187 #include "wimlib/util.h"
188
189 /* Note: BT_MATCHFINDER_HASH2_ORDER must be defined *before* including
190  * bt_matchfinder.h. */
191
192 /* Matchfinders with 16-bit positions  */
193 #define mf_pos_t        u16
194 #define MF_SUFFIX       _16
195 #include "wimlib/bt_matchfinder.h"
196 #include "wimlib/hc_matchfinder.h"
197
198 /* Matchfinders with 32-bit positions  */
199 #undef mf_pos_t
200 #undef MF_SUFFIX
201 #define mf_pos_t        u32
202 #define MF_SUFFIX       _32
203 #include "wimlib/bt_matchfinder.h"
204 #include "wimlib/hc_matchfinder.h"
205
206 /******************************************************************************/
207 /*                            Compressor structure                            */
208 /*----------------------------------------------------------------------------*/
209
210 /* Codewords for the Huffman codes */
211 struct lzx_codewords {
212         u32 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
213         u32 len[LZX_LENCODE_NUM_SYMBOLS];
214         u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
215 };
216
217 /*
218  * Codeword lengths, in bits, for the Huffman codes.
219  *
220  * A codeword length of 0 means the corresponding codeword has zero frequency.
221  *
222  * The main and length codes each have one extra entry for use as a sentinel.
223  * See lzx_write_compressed_code().
224  */
225 struct lzx_lens {
226         u8 main[LZX_MAINCODE_MAX_NUM_SYMBOLS + 1];
227         u8 len[LZX_LENCODE_NUM_SYMBOLS + 1];
228         u8 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
229 };
230
231 /* Codewords and lengths for the Huffman codes */
232 struct lzx_codes {
233         struct lzx_codewords codewords;
234         struct lzx_lens lens;
235 };
236
237 /* Symbol frequency counters for the Huffman-encoded alphabets */
238 struct lzx_freqs {
239         u32 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
240         u32 len[LZX_LENCODE_NUM_SYMBOLS];
241         u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
242 };
243
244 /* Block split statistics.  See the "Block splitting algorithm" section later in
245  * this file for details. */
246 #define NUM_LITERAL_OBSERVATION_TYPES 8
247 #define NUM_MATCH_OBSERVATION_TYPES 2
248 #define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + \
249                                NUM_MATCH_OBSERVATION_TYPES)
250 struct lzx_block_split_stats {
251         u32 new_observations[NUM_OBSERVATION_TYPES];
252         u32 observations[NUM_OBSERVATION_TYPES];
253         u32 num_new_observations;
254         u32 num_observations;
255 };
256
257 /*
258  * Represents a run of literals followed by a match or end-of-block.  This
259  * structure is needed to temporarily store items chosen by the compressor,
260  * since items cannot be written until all items for the block have been chosen
261  * and the block's Huffman codes have been computed.
262  */
263 struct lzx_sequence {
264
265         /* The number of literals in the run.  This may be 0.  The literals are
266          * not stored explicitly in this structure; instead, they are read
267          * directly from the uncompressed data.  */
268         u16 litrunlen;
269
270         /* If the next field doesn't indicate end-of-block, then this is the
271          * match length minus LZX_MIN_MATCH_LEN.  */
272         u16 adjusted_length;
273
274         /* If bit 31 is clear, then this field contains the match header in bits
275          * 0-8, and either the match offset plus LZX_OFFSET_ADJUSTMENT or a
276          * recent offset code in bits 9-30.  Otherwise (if bit 31 is set), this
277          * sequence's literal run was the last literal run in the block, so
278          * there is no match that follows it.  */
279         u32 adjusted_offset_and_match_hdr;
280 };
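
/*
 * Field decomposition sketch (the helper names below are hypothetical and
 * unused; lzx_write_sequences() later in this file performs the same bit
 * manipulation inline):
 */
#if 0
static bool
example_seq_is_end_of_block(const struct lzx_sequence *seq)
{
        /* Bit 31 set means no match follows the literal run */
        return (seq->adjusted_offset_and_match_hdr & 0x80000000) != 0;
}

static unsigned
example_seq_match_hdr(const struct lzx_sequence *seq)
{
        /* Bits 0-8: the match header (offset slot and length header) */
        return seq->adjusted_offset_and_match_hdr & 0x1FF;
}

static u32
example_seq_adjusted_offset(const struct lzx_sequence *seq)
{
        /* Bits 9-30: offset plus LZX_OFFSET_ADJUSTMENT, or a recent offset code */
        return seq->adjusted_offset_and_match_hdr >> 9;
}
#endif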
281
282 /*
283  * This structure represents a byte position in the input buffer and a node in
284  * the graph of possible match/literal choices.
285  *
286  * Logically, each incoming edge to this node is labeled with a literal or a
287  * match that can be taken to reach this position from an earlier position; and
288  * each outgoing edge from this node is labeled with a literal or a match that
289  * can be taken to advance from this position to a later position.
290  */
291 struct lzx_optimum_node {
292
293         /* The cost, in bits, of the lowest-cost path that has been found to
294          * reach this position.  This can change as progressively lower cost
295          * paths are found to reach this position.  */
296         u32 cost;
297
298         /*
299          * The match or literal that was taken to reach this position.  This can
300          * change as progressively lower cost paths are found to reach this
301          * position.
302          *
303          * For non-gap matches, this variable is divided into two bitfields
304          * whose meanings depend on the item type:
305          *
306          * Literals:
307          *      Low bits are 0, high bits are the literal.
308          *
309          * Explicit offset matches:
310          *      Low bits are the match length, high bits are the offset plus 2.
311          *
312          * Repeat offset matches:
313          *      Low bits are the match length, high bits are the queue index.
314          *
315          * For gap matches, identified by OPTIMUM_GAP_MATCH set, special
316          * behavior applies --- see the code.
317          */
318         u32 item;
319 #define OPTIMUM_OFFSET_SHIFT 9
320 #define OPTIMUM_LEN_MASK ((1 << OPTIMUM_OFFSET_SHIFT) - 1)
321 #if CONSIDER_GAP_MATCHES
322 #  define OPTIMUM_GAP_MATCH 0x80000000
323 #endif
324
325 } _aligned_attribute(8);
326
327 /* The cost model for near-optimal parsing */
328 struct lzx_costs {
329
330         /* 'match_cost[offset_slot][len - LZX_MIN_MATCH_LEN]' is the cost for a
331          * length 'len' match that has an offset belonging to 'offset_slot'. */
332         u16 match_cost[LZX_MAX_OFFSET_SLOTS][LZX_NUM_LENS];
333
334         /* Cost for each symbol in the main code */
335         u32 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
336
337         /* Cost for each symbol in the length code */
338         u32 len[LZX_LENCODE_NUM_SYMBOLS];
339
340 #if CONSIDER_ALIGNED_COSTS
341         /* Cost for each symbol in the aligned code */
342         u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
343 #endif
344 };
345
346 struct lzx_output_bitstream;
347
348 /* The main LZX compressor structure  */
349 struct lzx_compressor {
350
351         /* The buffer for preprocessed input data, if not using destructive
352          * compression */
353         void *in_buffer;
354
355         /* If true, the compressor need not preserve the input buffer if it
356          * compresses the data successfully.  */
357         bool destructive;
358
359         /* Pointer to the compress() implementation chosen at allocation time */
360         void (*impl)(struct lzx_compressor *, const u8 *, size_t,
361                      struct lzx_output_bitstream *);
362
363         /* The log base 2 of the window size for LZ match offset encoding
364          * purposes.  This will be >= LZX_MIN_WINDOW_ORDER and <=
365          * LZX_MAX_WINDOW_ORDER. */
366         unsigned window_order;
367
368         /* The number of symbols in the main alphabet.  This depends on the
369          * window order, since the window order determines the maximum possible
370          * match offset. */
371         unsigned num_main_syms;
372
373         /* The "nice" match length: if a match of this length is found, then it
374          * is chosen immediately without further consideration. */
375         unsigned nice_match_length;
376
377         /* The maximum search depth: at most this many potential matches are
378          * considered at each position. */
379         unsigned max_search_depth;
380
381         /* The number of optimization passes per block */
382         unsigned num_optim_passes;
383
384         /* The symbol frequency counters for the current block */
385         struct lzx_freqs freqs;
386
387         /* Block split statistics */
388         struct lzx_block_split_stats split_stats;
389
390         /* The Huffman codes for the current and previous blocks.  The one with
391          * index 'codes_index' is for the current block, and the other one is
392          * for the previous block. */
393         struct lzx_codes codes[2];
394         unsigned codes_index;
395
396         /* The matches and literals that the compressor has chosen for the
397          * current block.  The required length of this array is limited by the
398          * maximum number of matches that can ever be chosen for a single block,
399          * plus one for the special entry at the end. */
400         struct lzx_sequence chosen_sequences[
401                        DIV_ROUND_UP(SOFT_MAX_BLOCK_SIZE, LZX_MIN_MATCH_LEN) + 1];
402
403         /* Tables for mapping adjusted offsets to offset slots */
404         u8 offset_slot_tab_1[32768]; /* offset slots [0, 29] */
405         u8 offset_slot_tab_2[128]; /* offset slots [30, 49] */
406
407         union {
408                 /* Data for lzx_compress_lazy() */
409                 struct {
410                         /* Hash chains matchfinder (MUST BE LAST!!!) */
411                         union {
412                                 struct hc_matchfinder_16 hc_mf_16;
413                                 struct hc_matchfinder_32 hc_mf_32;
414                         };
415                 };
416
417                 /* Data for lzx_compress_near_optimal() */
418                 struct {
419                         /*
420                          * Array of nodes, one per position, for running the
421                          * minimum-cost path algorithm.
422                          *
423                          * This array must be large enough to accommodate the
424                          * worst-case number of nodes, which occurs if the
425                          * compressor finds a match of length LZX_MAX_MATCH_LEN
426                          * at position 'SOFT_MAX_BLOCK_SIZE - 1', producing a
427                          * block of size 'SOFT_MAX_BLOCK_SIZE - 1 +
428                          * LZX_MAX_MATCH_LEN'.  Add one for the end-of-block
429                          * node.
430                          */
431                         struct lzx_optimum_node optimum_nodes[
432                                                     SOFT_MAX_BLOCK_SIZE - 1 +
433                                                     LZX_MAX_MATCH_LEN + 1];
434
435                         /* The cost model for the current optimization pass */
436                         struct lzx_costs costs;
437
438                         /*
439                          * Cached matches for the current block.  This array
440                          * contains the matches that were found at each position
441                          * in the block.  Specifically, for each position, there
442                          * is a special 'struct lz_match' whose 'length' field
443                          * contains the number of matches that were found at
444                          * that position; this is followed by the matches
445                          * themselves, if any, sorted by strictly increasing
446                          * length.
447                          *
448                          * Note: in rare cases, there will be a very high number
449                          * of matches in the block and this array will overflow.
450                          * If this happens, we force the end of the current
451                          * block.  CACHE_LENGTH is the length at which we
452                          * actually check for overflow.  The extra slots beyond
453                          * this are enough to absorb the worst case overflow,
454                          * which occurs if starting at &match_cache[CACHE_LENGTH
455                          * - 1], we write the match count header, then write
456                          * MAX_MATCHES_PER_POS matches, then skip searching for
457                          * matches at 'LZX_MAX_MATCH_LEN - 1' positions and
458                          * write the match count header for each.
459                          */
460                         struct lz_match match_cache[CACHE_LENGTH +
461                                                     MAX_MATCHES_PER_POS +
462                                                     LZX_MAX_MATCH_LEN - 1];
463
464                         /* Binary trees matchfinder (MUST BE LAST!!!) */
465                         union {
466                                 struct bt_matchfinder_16 bt_mf_16;
467                                 struct bt_matchfinder_32 bt_mf_32;
468                         };
469                 };
470         };
471 };
472
473 /******************************************************************************/
474 /*                            Matchfinder utilities                           */
475 /*----------------------------------------------------------------------------*/
476
477 /*
478  * Will a matchfinder using 16-bit positions be sufficient for compressing
479  * buffers of up to the specified size?  The limit could be 65536 bytes, but we
480  * also want to optimize out the use of offset_slot_tab_2 in the 16-bit case.
481  * This requires that the limit be no more than the length of offset_slot_tab_1
482  * (currently 32768).
483  */
484 static inline bool
485 lzx_is_16_bit(size_t max_bufsize)
486 {
487         STATIC_ASSERT(ARRAY_LEN(((struct lzx_compressor *)0)->offset_slot_tab_1) == 32768);
488         return max_bufsize <= 32768;
489 }
490
491 /*
492  * Return the offset slot for the specified adjusted match offset.
493  */
494 static inline unsigned
495 lzx_get_offset_slot(struct lzx_compressor *c, u32 adjusted_offset,
496                     bool is_16_bit)
497 {
498         if (is_16_bit || adjusted_offset < ARRAY_LEN(c->offset_slot_tab_1))
499                 return c->offset_slot_tab_1[adjusted_offset];
500         return c->offset_slot_tab_2[adjusted_offset >> 14];
501 }
502
503 /*
504  * The following macros call either the 16-bit or the 32-bit version of a
505  * matchfinder function based on the value of 'is_16_bit', which will be known
506  * at compilation time.
507  */
508
509 #define CALL_HC_MF(is_16_bit, c, funcname, ...)                               \
510         ((is_16_bit) ? CONCAT(funcname, _16)(&(c)->hc_mf_16, ##__VA_ARGS__) : \
511                        CONCAT(funcname, _32)(&(c)->hc_mf_32, ##__VA_ARGS__));
512
513 #define CALL_BT_MF(is_16_bit, c, funcname, ...)                               \
514         ((is_16_bit) ? CONCAT(funcname, _16)(&(c)->bt_mf_16, ##__VA_ARGS__) : \
515                        CONCAT(funcname, _32)(&(c)->bt_mf_32, ##__VA_ARGS__));
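
/*
 * Usage sketch (illustrative): a statement such as
 *
 *      CALL_BT_MF(is_16_bit, c, bt_matchfinder_init);
 *
 * resolves to bt_matchfinder_init_16(&c->bt_mf_16) or
 * bt_matchfinder_init_32(&c->bt_mf_32).  Because 'is_16_bit' is a
 * compile-time constant at each call site, the untaken branch is optimized
 * away.
 */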
516
517 /******************************************************************************/
518 /*                             Output bitstream                               */
519 /*----------------------------------------------------------------------------*/
520
521 /*
522  * The LZX bitstream is encoded as a sequence of little endian 16-bit coding
523  * units.  Bits are ordered from most significant to least significant within
524  * each coding unit.
525  */
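
/*
 * Worked example: writing the 3-bit value 0b101 and then 13 zero bits fills
 * one coding unit with the value 0b101 << 13 == 0xA000, which is emitted as
 * the bytes 0x00 0xA0 (little endian).
 */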
526
527 /*
528  * Structure to keep track of the current state of sending bits to the
529  * compressed output buffer.
530  */
531 struct lzx_output_bitstream {
532
533         /* Bits that haven't yet been written to the output buffer */
534         machine_word_t bitbuf;
535
536         /* Number of bits currently held in @bitbuf */
537         machine_word_t bitcount;
538
539         /* Pointer to the start of the output buffer */
540         u8 *start;
541
542         /* Pointer to the position in the output buffer at which the next coding
543          * unit should be written */
544         u8 *next;
545
546         /* Pointer just past the end of the output buffer, rounded down to a
547          * 2-byte boundary */
548         u8 *end;
549 };
550
551 /* Can the specified number of bits always be added to 'bitbuf' after any
552  * pending 16-bit coding units have been flushed?  */
553 #define CAN_BUFFER(n)   ((n) <= WORDBITS - 15)
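
/*
 * For example, with 64-bit machine words CAN_BUFFER(48) holds because
 * 48 <= 64 - 15 == 49; this is what lets the 64-bit fast paths below buffer
 * three main-code codewords, or an entire match, between flushes.
 */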
554
555 /* Initialize the output bitstream to write to the specified buffer. */
556 static void
557 lzx_init_output(struct lzx_output_bitstream *os, void *buffer, size_t size)
558 {
559         os->bitbuf = 0;
560         os->bitcount = 0;
561         os->start = buffer;
562         os->next = os->start;
563         os->end = os->start + (size & ~1);
564 }
565
566 /*
567  * Add some bits to the bitbuffer variable of the output bitstream.  The caller
568  * must make sure there is enough room.
569  */
570 static inline void
571 lzx_add_bits(struct lzx_output_bitstream *os, u32 bits, unsigned num_bits)
572 {
573         os->bitbuf = (os->bitbuf << num_bits) | bits;
574         os->bitcount += num_bits;
575 }
576
577 /*
578  * Flush bits from the bitbuffer variable to the output buffer.  'max_num_bits'
579  * specifies the maximum number of bits that may have been added since the last
580  * flush.
581  */
582 static inline void
583 lzx_flush_bits(struct lzx_output_bitstream *os, unsigned max_num_bits)
584 {
585         /* Masking the number of bits to shift is only needed to avoid undefined
586          * behavior; we don't actually care about the results of bad shifts.  On
587          * x86, the explicit masking generates no extra code.  */
588         const u32 shift_mask = WORDBITS - 1;
589
590         if (os->end - os->next < 6)
591                 return;
592         put_unaligned_le16(os->bitbuf >> ((os->bitcount - 16) &
593                                             shift_mask), os->next + 0);
594         if (max_num_bits > 16)
595                 put_unaligned_le16(os->bitbuf >> ((os->bitcount - 32) &
596                                                 shift_mask), os->next + 2);
597         if (max_num_bits > 32)
598                 put_unaligned_le16(os->bitbuf >> ((os->bitcount - 48) &
599                                                 shift_mask), os->next + 4);
600         os->next += (os->bitcount >> 4) << 1;
601         os->bitcount &= 15;
602 }
603
604 /* Add at most 16 bits to the bitbuffer and flush it.  */
605 static inline void
606 lzx_write_bits(struct lzx_output_bitstream *os, u32 bits, unsigned num_bits)
607 {
608         lzx_add_bits(os, bits, num_bits);
609         lzx_flush_bits(os, 16);
610 }
611
612 /*
613  * Flush the last coding unit to the output buffer if needed.  Return the total
614  * number of bytes written to the output buffer, or 0 if an overflow occurred.
615  */
616 static size_t
617 lzx_flush_output(struct lzx_output_bitstream *os)
618 {
619         if (os->end - os->next < 6)
620                 return 0;
621
622         if (os->bitcount != 0) {
623                 put_unaligned_le16(os->bitbuf << (16 - os->bitcount), os->next);
624                 os->next += 2;
625         }
626
627         return os->next - os->start;
628 }
629
630 /******************************************************************************/
631 /*                           Preparing Huffman codes                          */
632 /*----------------------------------------------------------------------------*/
633
634 /*
635  * Build the Huffman codes.  This takes as input the frequency tables for each
636  * code and produces as output a set of tables that map symbols to codewords and
637  * codeword lengths.
638  */
639 static void
640 lzx_build_huffman_codes(struct lzx_compressor *c)
641 {
642         const struct lzx_freqs *freqs = &c->freqs;
643         struct lzx_codes *codes = &c->codes[c->codes_index];
644
645         STATIC_ASSERT(MAIN_CODEWORD_LIMIT >= 9 &&
646                       MAIN_CODEWORD_LIMIT <= LZX_MAX_MAIN_CODEWORD_LEN);
647         make_canonical_huffman_code(c->num_main_syms,
648                                     MAIN_CODEWORD_LIMIT,
649                                     freqs->main,
650                                     codes->lens.main,
651                                     codes->codewords.main);
652
653         STATIC_ASSERT(LENGTH_CODEWORD_LIMIT >= 8 &&
654                       LENGTH_CODEWORD_LIMIT <= LZX_MAX_LEN_CODEWORD_LEN);
655         make_canonical_huffman_code(LZX_LENCODE_NUM_SYMBOLS,
656                                     LENGTH_CODEWORD_LIMIT,
657                                     freqs->len,
658                                     codes->lens.len,
659                                     codes->codewords.len);
660
661         STATIC_ASSERT(ALIGNED_CODEWORD_LIMIT >= LZX_NUM_ALIGNED_OFFSET_BITS &&
662                       ALIGNED_CODEWORD_LIMIT <= LZX_MAX_ALIGNED_CODEWORD_LEN);
663         make_canonical_huffman_code(LZX_ALIGNEDCODE_NUM_SYMBOLS,
664                                     ALIGNED_CODEWORD_LIMIT,
665                                     freqs->aligned,
666                                     codes->lens.aligned,
667                                     codes->codewords.aligned);
668 }
669
670 /* Reset the symbol frequencies for the current block. */
671 static void
672 lzx_reset_symbol_frequencies(struct lzx_compressor *c)
673 {
674         memset(&c->freqs, 0, sizeof(c->freqs));
675 }
676
677 static unsigned
678 lzx_compute_precode_items(const u8 lens[restrict],
679                           const u8 prev_lens[restrict],
680                           u32 precode_freqs[restrict],
681                           unsigned precode_items[restrict])
682 {
683         unsigned *itemptr;
684         unsigned run_start;
685         unsigned run_end;
686         unsigned extra_bits;
687         int delta;
688         u8 len;
689
690         itemptr = precode_items;
691         run_start = 0;
692
693         while (!((len = lens[run_start]) & 0x80)) {
694
695                 /* len = the length being repeated  */
696
697                 /* Find the next run of codeword lengths.  */
698
699                 run_end = run_start + 1;
700
701                 /* Fast case for a single length.  */
702                 if (likely(len != lens[run_end])) {
703                         delta = prev_lens[run_start] - len;
704                         if (delta < 0)
705                                 delta += 17;
706                         precode_freqs[delta]++;
707                         *itemptr++ = delta;
708                         run_start++;
709                         continue;
710                 }
711
712                 /* Extend the run.  */
713                 do {
714                         run_end++;
715                 } while (len == lens[run_end]);
716
717                 if (len == 0) {
718                         /* Run of zeroes.  */
719
720                         /* Symbol 18: RLE 20 to 51 zeroes at a time.  */
721                         while ((run_end - run_start) >= 20) {
722                                 extra_bits = min((run_end - run_start) - 20, 0x1f);
723                                 precode_freqs[18]++;
724                                 *itemptr++ = 18 | (extra_bits << 5);
725                                 run_start += 20 + extra_bits;
726                         }
727
728                         /* Symbol 17: RLE 4 to 19 zeroes at a time.  */
729                         if ((run_end - run_start) >= 4) {
730                                 extra_bits = min((run_end - run_start) - 4, 0xf);
731                                 precode_freqs[17]++;
732                                 *itemptr++ = 17 | (extra_bits << 5);
733                                 run_start += 4 + extra_bits;
734                         }
735                 } else {
736
737                         /* A run of nonzero lengths. */
738
739                         /* Symbol 19: RLE 4 to 5 of any length at a time.  */
740                         while ((run_end - run_start) >= 4) {
741                                 extra_bits = (run_end - run_start) > 4;
742                                 delta = prev_lens[run_start] - len;
743                                 if (delta < 0)
744                                         delta += 17;
745                                 precode_freqs[19]++;
746                                 precode_freqs[delta]++;
747                                 *itemptr++ = 19 | (extra_bits << 5) | (delta << 6);
748                                 run_start += 4 + extra_bits;
749                         }
750                 }
751
752                 /* Output any remaining lengths without RLE.  */
753                 while (run_start != run_end) {
754                         delta = prev_lens[run_start] - len;
755                         if (delta < 0)
756                                 delta += 17;
757                         precode_freqs[delta]++;
758                         *itemptr++ = delta;
759                         run_start++;
760                 }
761         }
762
763         return itemptr - precode_items;
764 }
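
/*
 * Worked example of the item encoding: if a symbol's codeword length was 4 in
 * the previous code and is 6 now, the emitted precode symbol is
 * (4 - 6) mod 17 == 15, from which the decoder recovers 6 as (4 - 15) mod 17.
 * RLE items carry their extra bits above the 5-bit precode symbol, e.g. a run
 * of 30 zeroes becomes the item 18 | ((30 - 20) << 5).
 */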
765
766 /******************************************************************************/
767 /*                          Outputting compressed data                        */
768 /*----------------------------------------------------------------------------*/
769
770 /*
771  * Output a Huffman code in the compressed form used in LZX.
772  *
773  * The Huffman code is represented in the output as a logical series of codeword
774  * lengths from which the Huffman code, which must be in canonical form, can be
775  * reconstructed.
776  *
777  * The codeword lengths are themselves compressed using a separate Huffman code,
778  * the "precode", which contains a symbol for each possible codeword length in
779  * the larger code as well as several special symbols to represent repeated
780  * codeword lengths (a form of run-length encoding).  The precode is itself
781  * constructed in canonical form, and its codeword lengths are represented
782  * literally in 20 4-bit fields that immediately precede the compressed codeword
783  * lengths of the larger code.
784  *
785  * Furthermore, the codeword lengths of the larger code are actually represented
786  * as deltas from the codeword lengths of the corresponding code in the previous
787  * block.
788  *
789  * @os:
790  *      Bitstream to which to write the compressed Huffman code.
791  * @lens:
792  *      The codeword lengths, indexed by symbol, in the Huffman code.
793  * @prev_lens:
794  *      The codeword lengths, indexed by symbol, in the corresponding Huffman
795  *      code in the previous block, or all zeroes if this is the first block.
796  * @num_lens:
797  *      The number of symbols in the Huffman code.
798  */
799 static void
800 lzx_write_compressed_code(struct lzx_output_bitstream *os,
801                           const u8 lens[restrict],
802                           const u8 prev_lens[restrict],
803                           unsigned num_lens)
804 {
805         u32 precode_freqs[LZX_PRECODE_NUM_SYMBOLS];
806         u8 precode_lens[LZX_PRECODE_NUM_SYMBOLS];
807         u32 precode_codewords[LZX_PRECODE_NUM_SYMBOLS];
808         unsigned precode_items[num_lens];
809         unsigned num_precode_items;
810         unsigned precode_item;
811         unsigned precode_sym;
812         unsigned i;
813         u8 saved = lens[num_lens];
814         *(u8 *)(lens + num_lens) = 0x80;
815
816         for (i = 0; i < LZX_PRECODE_NUM_SYMBOLS; i++)
817                 precode_freqs[i] = 0;
818
819         /* Compute the "items" (RLE / literal tokens and extra bits) with which
820          * the codeword lengths in the larger code will be output.  */
821         num_precode_items = lzx_compute_precode_items(lens,
822                                                       prev_lens,
823                                                       precode_freqs,
824                                                       precode_items);
825
826         /* Build the precode.  */
827         STATIC_ASSERT(PRE_CODEWORD_LIMIT >= 5 &&
828                       PRE_CODEWORD_LIMIT <= LZX_MAX_PRE_CODEWORD_LEN);
829         make_canonical_huffman_code(LZX_PRECODE_NUM_SYMBOLS,
830                                     PRE_CODEWORD_LIMIT,
831                                     precode_freqs, precode_lens,
832                                     precode_codewords);
833
834         /* Output the lengths of the codewords in the precode.  */
835         for (i = 0; i < LZX_PRECODE_NUM_SYMBOLS; i++)
836                 lzx_write_bits(os, precode_lens[i], LZX_PRECODE_ELEMENT_SIZE);
837
838         /* Output the encoded lengths of the codewords in the larger code.  */
839         for (i = 0; i < num_precode_items; i++) {
840                 precode_item = precode_items[i];
841                 precode_sym = precode_item & 0x1F;
842                 lzx_add_bits(os, precode_codewords[precode_sym],
843                              precode_lens[precode_sym]);
844                 if (precode_sym >= 17) {
845                         if (precode_sym == 17) {
846                                 lzx_add_bits(os, precode_item >> 5, 4);
847                         } else if (precode_sym == 18) {
848                                 lzx_add_bits(os, precode_item >> 5, 5);
849                         } else {
850                                 lzx_add_bits(os, (precode_item >> 5) & 1, 1);
851                                 precode_sym = precode_item >> 6;
852                                 lzx_add_bits(os, precode_codewords[precode_sym],
853                                              precode_lens[precode_sym]);
854                         }
855                 }
856                 STATIC_ASSERT(CAN_BUFFER(2 * PRE_CODEWORD_LIMIT + 1));
857                 lzx_flush_bits(os, 2 * PRE_CODEWORD_LIMIT + 1);
858         }
859
860         *(u8 *)(lens + num_lens) = saved;
861 }
862
863 /*
864  * Write all matches and literal bytes (which were precomputed) in an LZX
865  * compressed block to the output bitstream in the final compressed
866  * representation.
867  *
868  * @os
869  *      The output bitstream.
870  * @block_type
871  *      The chosen type of the LZX compressed block (LZX_BLOCKTYPE_ALIGNED or
872  *      LZX_BLOCKTYPE_VERBATIM).
873  * @block_data
874  *      The uncompressed data of the block.
875  * @sequences
876  *      The matches and literals to output, given as a series of sequences.
877  * @codes
878  *      The main, length, and aligned offset Huffman codes for the current
879  *      LZX compressed block.
880  */
881 static void
882 lzx_write_sequences(struct lzx_output_bitstream *os, int block_type,
883                     const u8 *block_data, const struct lzx_sequence sequences[],
884                     const struct lzx_codes *codes)
885 {
886         const struct lzx_sequence *seq = sequences;
887         u32 ones_if_aligned = 0 - (block_type == LZX_BLOCKTYPE_ALIGNED);
888
889         for (;;) {
890                 /* Output the next sequence.  */
891
892                 unsigned litrunlen = seq->litrunlen;
893                 unsigned match_hdr;
894                 unsigned main_symbol;
895                 unsigned adjusted_length;
896                 u32 adjusted_offset;
897                 unsigned offset_slot;
898                 unsigned num_extra_bits;
899                 u32 extra_bits;
900
901                 /* Output the literal run of the sequence.  */
902
903                 if (litrunlen) {  /* Is the literal run nonempty?  */
904
905                         /* Verify optimization is enabled on 64-bit  */
906                         STATIC_ASSERT(WORDBITS < 64 ||
907                                       CAN_BUFFER(3 * MAIN_CODEWORD_LIMIT));
908
909                         if (CAN_BUFFER(3 * MAIN_CODEWORD_LIMIT)) {
910
911                                 /* 64-bit: write 3 literals at a time.  */
912                                 while (litrunlen >= 3) {
913                                         unsigned lit0 = block_data[0];
914                                         unsigned lit1 = block_data[1];
915                                         unsigned lit2 = block_data[2];
916                                         lzx_add_bits(os, codes->codewords.main[lit0],
917                                                      codes->lens.main[lit0]);
918                                         lzx_add_bits(os, codes->codewords.main[lit1],
919                                                      codes->lens.main[lit1]);
920                                         lzx_add_bits(os, codes->codewords.main[lit2],
921                                                      codes->lens.main[lit2]);
922                                         lzx_flush_bits(os, 3 * MAIN_CODEWORD_LIMIT);
923                                         block_data += 3;
924                                         litrunlen -= 3;
925                                 }
926                                 if (litrunlen--) {
927                                         unsigned lit = *block_data++;
928                                         lzx_add_bits(os, codes->codewords.main[lit],
929                                                      codes->lens.main[lit]);
930                                         if (litrunlen--) {
931                                                 unsigned lit = *block_data++;
932                                                 lzx_add_bits(os, codes->codewords.main[lit],
933                                                              codes->lens.main[lit]);
934                                                 lzx_flush_bits(os, 2 * MAIN_CODEWORD_LIMIT);
935                                         } else {
936                                                 lzx_flush_bits(os, 1 * MAIN_CODEWORD_LIMIT);
937                                         }
938                                 }
939                         } else {
940                                 /* 32-bit: write 1 literal at a time.  */
941                                 do {
942                                         unsigned lit = *block_data++;
943                                         lzx_add_bits(os, codes->codewords.main[lit],
944                                                      codes->lens.main[lit]);
945                                         lzx_flush_bits(os, MAIN_CODEWORD_LIMIT);
946                                 } while (--litrunlen);
947                         }
948                 }
949
950                 /* Was this the last literal run?  */
951                 if (seq->adjusted_offset_and_match_hdr & 0x80000000)
952                         return;
953
954                 /* Nope; output the match.  */
955
956                 match_hdr = seq->adjusted_offset_and_match_hdr & 0x1FF;
957                 main_symbol = LZX_NUM_CHARS + match_hdr;
958                 adjusted_length = seq->adjusted_length;
959
960                 block_data += adjusted_length + LZX_MIN_MATCH_LEN;
961
962                 offset_slot = match_hdr / LZX_NUM_LEN_HEADERS;
963                 adjusted_offset = seq->adjusted_offset_and_match_hdr >> 9;
964
965                 num_extra_bits = lzx_extra_offset_bits[offset_slot];
966                 extra_bits = adjusted_offset - lzx_offset_slot_base[offset_slot];
967
968         #define MAX_MATCH_BITS  (MAIN_CODEWORD_LIMIT + LENGTH_CODEWORD_LIMIT + \
969                                  14 + ALIGNED_CODEWORD_LIMIT)
970
971                 /* Verify optimization is enabled on 64-bit  */
972                 STATIC_ASSERT(WORDBITS < 64 || CAN_BUFFER(MAX_MATCH_BITS));
973
974                 /* Output the main symbol for the match.  */
975
976                 lzx_add_bits(os, codes->codewords.main[main_symbol],
977                              codes->lens.main[main_symbol]);
978                 if (!CAN_BUFFER(MAX_MATCH_BITS))
979                         lzx_flush_bits(os, MAIN_CODEWORD_LIMIT);
980
981                 /* If needed, output the length symbol for the match.  */
982
983                 if (adjusted_length >= LZX_NUM_PRIMARY_LENS) {
984                         lzx_add_bits(os, codes->codewords.len[adjusted_length -
985                                                               LZX_NUM_PRIMARY_LENS],
986                                      codes->lens.len[adjusted_length -
987                                                      LZX_NUM_PRIMARY_LENS]);
988                         if (!CAN_BUFFER(MAX_MATCH_BITS))
989                                 lzx_flush_bits(os, LENGTH_CODEWORD_LIMIT);
990                 }
991
992                 /* Output the extra offset bits for the match.  In aligned
993                  * offset blocks, the lowest 3 bits of the adjusted offset are
994                  * Huffman-encoded using the aligned offset code, provided that
995                  * there are at least 3 extra offset bits required.  All other
996                  * extra offset bits are output verbatim.  */
997
998                 if ((adjusted_offset & ones_if_aligned) >= 16) {
999
1000                         lzx_add_bits(os, extra_bits >> LZX_NUM_ALIGNED_OFFSET_BITS,
1001                                      num_extra_bits - LZX_NUM_ALIGNED_OFFSET_BITS);
1002                         if (!CAN_BUFFER(MAX_MATCH_BITS))
1003                                 lzx_flush_bits(os, 14);
1004
1005                         lzx_add_bits(os, codes->codewords.aligned[adjusted_offset &
1006                                                                   LZX_ALIGNED_OFFSET_BITMASK],
1007                                      codes->lens.aligned[adjusted_offset &
1008                                                          LZX_ALIGNED_OFFSET_BITMASK]);
1009                         if (!CAN_BUFFER(MAX_MATCH_BITS))
1010                                 lzx_flush_bits(os, ALIGNED_CODEWORD_LIMIT);
1011                 } else {
1012                         STATIC_ASSERT(CAN_BUFFER(17));
1013
1014                         lzx_add_bits(os, extra_bits, num_extra_bits);
1015                         if (!CAN_BUFFER(MAX_MATCH_BITS))
1016                                 lzx_flush_bits(os, 17);
1017                 }
1018
1019                 if (CAN_BUFFER(MAX_MATCH_BITS))
1020                         lzx_flush_bits(os, MAX_MATCH_BITS);
1021
1022                 /* Advance to the next sequence.  */
1023                 seq++;
1024         }
1025 }
1026
1027 static void
1028 lzx_write_compressed_block(const u8 *block_begin,
1029                            int block_type,
1030                            u32 block_size,
1031                            unsigned window_order,
1032                            unsigned num_main_syms,
1033                            const struct lzx_sequence sequences[],
1034                            const struct lzx_codes * codes,
1035                            const struct lzx_lens * prev_lens,
1036                            struct lzx_output_bitstream * os)
1037 {
1038         /* The first three bits indicate the type of block and are one of the
1039          * LZX_BLOCKTYPE_* constants.  */
1040         lzx_write_bits(os, block_type, 3);
1041
1042         /*
1043          * Output the block size.
1044          *
1045          * The original LZX format encoded the block size in 24 bits.  However,
1046          * the LZX format used in WIM archives uses 1 bit to specify whether the
1047          * block has the default size of 32768 bytes, then optionally 16 bits to
1048          * specify a non-default size.  This works fine for Microsoft's WIM
1049          * software (WIMGAPI), which never compresses more than 32768 bytes at a
1050          * time with LZX.  However, as an extension, our LZX compressor supports
1051          * compressing up to 2097152 bytes, with a corresponding increase in
1052          * window size.  It is possible for blocks in these larger buffers to
1053          * exceed 65535 bytes; such blocks cannot have their size represented in
1054          * 16 bits.
1055          *
1056          * The chosen solution was to use 24 bits for the block size when
1057          * possibly required --- specifically, when the compressor has been
1058          * allocated to be capable of compressing more than 32768 bytes at once.
1059          */
1060         if (block_size == LZX_DEFAULT_BLOCK_SIZE) {
1061                 lzx_write_bits(os, 1, 1);
1062         } else {
1063                 lzx_write_bits(os, 0, 1);
1064
1065                 if (window_order >= 16)
1066                         lzx_write_bits(os, block_size >> 16, 8);
1067
1068                 lzx_write_bits(os, block_size & 0xFFFF, 16);
1069         }
1070
1071         /* If it's an aligned offset block, output the aligned offset code.  */
1072         if (block_type == LZX_BLOCKTYPE_ALIGNED) {
1073                 for (int i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
1074                         lzx_write_bits(os, codes->lens.aligned[i],
1075                                        LZX_ALIGNEDCODE_ELEMENT_SIZE);
1076                 }
1077         }
1078
1079         /* Output the main code (two parts).  */
1080         lzx_write_compressed_code(os, codes->lens.main,
1081                                   prev_lens->main,
1082                                   LZX_NUM_CHARS);
1083         lzx_write_compressed_code(os, codes->lens.main + LZX_NUM_CHARS,
1084                                   prev_lens->main + LZX_NUM_CHARS,
1085                                   num_main_syms - LZX_NUM_CHARS);
1086
1087         /* Output the length code.  */
1088         lzx_write_compressed_code(os, codes->lens.len,
1089                                   prev_lens->len,
1090                                   LZX_LENCODE_NUM_SYMBOLS);
1091
1092         /* Output the compressed matches and literals.  */
1093         lzx_write_sequences(os, block_type, block_begin, sequences, codes);
1094 }
1095
1096 /*
1097  * Given the frequencies of symbols in an LZX-compressed block and the
1098  * corresponding Huffman codes, return LZX_BLOCKTYPE_ALIGNED or
1099  * LZX_BLOCKTYPE_VERBATIM if an aligned offset or verbatim block, respectively,
1100  * will take fewer bits to output.
1101  */
1102 static int
1103 lzx_choose_verbatim_or_aligned(const struct lzx_freqs * freqs,
1104                                const struct lzx_codes * codes)
1105 {
1106         u32 aligned_cost = 0;
1107         u32 verbatim_cost = 0;
1108
1109         /* A verbatim block requires 3 bits in each place that an aligned offset
1110          * symbol would be used in an aligned offset block.  */
1111         for (unsigned i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
1112                 verbatim_cost += LZX_NUM_ALIGNED_OFFSET_BITS * freqs->aligned[i];
1113                 aligned_cost += codes->lens.aligned[i] * freqs->aligned[i];
1114         }
1115
1116         /* Account for the cost of sending the codeword lengths of the aligned
1117          * offset code.  */
1118         aligned_cost += LZX_ALIGNEDCODE_ELEMENT_SIZE *
1119                         LZX_ALIGNEDCODE_NUM_SYMBOLS;
1120
1121         if (aligned_cost < verbatim_cost)
1122                 return LZX_BLOCKTYPE_ALIGNED;
1123         else
1124                 return LZX_BLOCKTYPE_VERBATIM;
1125 }
1126
1127 /*
1128  * Flush an LZX block:
1129  *
1130  * 1. Build the Huffman codes.
1131  * 2. Decide whether to output the block as VERBATIM or ALIGNED.
1132  * 3. Write the block.
1133  * 4. Swap the indices of the current and previous Huffman codes.
1134  */
1135 static void
1136 lzx_flush_block(struct lzx_compressor *c, struct lzx_output_bitstream *os,
1137                 const u8 *block_begin, u32 block_size, u32 seq_idx)
1138 {
1139         int block_type;
1140
1141         lzx_build_huffman_codes(c);
1142
1143         block_type = lzx_choose_verbatim_or_aligned(&c->freqs,
1144                                                     &c->codes[c->codes_index]);
1145         lzx_write_compressed_block(block_begin,
1146                                    block_type,
1147                                    block_size,
1148                                    c->window_order,
1149                                    c->num_main_syms,
1150                                    &c->chosen_sequences[seq_idx],
1151                                    &c->codes[c->codes_index],
1152                                    &c->codes[c->codes_index ^ 1].lens,
1153                                    os);
1154         c->codes_index ^= 1;
1155 }
1156
1157 /******************************************************************************/
1158 /*                          Block splitting algorithm                         */
1159 /*----------------------------------------------------------------------------*/
1160
1161 /*
1162  * The problem of block splitting is to decide when it is worthwhile to start a
1163  * new block with new entropy codes.  There is a theoretically optimal solution:
1164  * recursively consider every possible block split, considering the exact cost
1165  * of each block, and choose the minimum cost approach.  But this is far too
1166  * slow.  Instead, as an approximation, we can count symbols and after every N
1167  * symbols, compare the expected distribution of symbols based on the previous
1168  * data with the actual distribution.  If they differ "by enough", then start a
1169  * new block.
1170  *
1171  * As an optimization and heuristic, we don't distinguish between every symbol
1172  * but rather we combine many symbols into a single "observation type".  For
1173  * literals we only look at the high bits and low bits, and for matches we only
1174  * look at whether the match is long or not.  The assumption is that for typical
1175  * "real" data, places that are good block boundaries will tend to be noticeable
1176  * based only on changes in these aggregate frequencies, without looking for
1177  * subtle differences in individual symbols.  For example, a change from ASCII
1178  * bytes to non-ASCII bytes, or from few matches (generally less compressible)
1179  * to many matches (generally more compressible), would be easily noticed based
1180  * on the aggregates.
1181  *
1182  * For determining whether the frequency distributions are "different enough" to
1183  * start a new block, the simple heuristic of splitting when the sum of absolute
1184  * differences exceeds a constant seems to be good enough.
1185  *
1186  * Finally, for an approximation, it is not strictly necessary that the exact
1187  * symbols being used are considered.  With "near-optimal parsing", for example,
1188  * the actual symbols that will be used are unknown until after the block
1189  * boundary is chosen and the block has been optimized.  Since the final choices
1190  * cannot be used, we can use preliminary "greedy" choices instead.
1191  */
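
/*
 * Concretely, the check in lzx_end_block_check() below scales both sides by
 * num_observations * num_new_observations; dividing that scaling back out,
 * the block is ended once the sum over all observation types of
 *
 *      |new_observations[i] / num_new_observations -
 *       observations[i] / num_observations|
 *
 * reaches roughly 7/8, i.e. once the L1 distance between the new and the
 * accumulated relative frequency distributions (at most 2) grows large.
 */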
1192
1193 /* Initialize the block split statistics when starting a new block. */
1194 static void
1195 lzx_init_block_split_stats(struct lzx_block_split_stats *stats)
1196 {
1197         for (int i = 0; i < NUM_OBSERVATION_TYPES; i++) {
1198                 stats->new_observations[i] = 0;
1199                 stats->observations[i] = 0;
1200         }
1201         stats->num_new_observations = 0;
1202         stats->num_observations = 0;
1203 }
1204
1205 /* Literal observation.  Heuristic: use the top 2 bits and the low bit of the
1206  * literal, for 8 possible literal observation types.  */
1207 static inline void
1208 lzx_observe_literal(struct lzx_block_split_stats *stats, u8 lit)
1209 {
1210         stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++;
1211         stats->num_new_observations++;
1212 }
1213
1214 /* Match observation.  Heuristic: use one observation type for "short match" and
1215  * one observation type for "long match".  */
1216 static inline void
1217 lzx_observe_match(struct lzx_block_split_stats *stats, unsigned length)
1218 {
1219         stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + (length >= 5)]++;
1220         stats->num_new_observations++;
1221 }
1222
1223 static bool
1224 lzx_end_block_check(struct lzx_block_split_stats *stats)
1225 {
1226         if (stats->num_observations > 0) {
1227
1228                 /* Note: to avoid slow divisions, we do not divide by the
1229                  * observation counts, but rather do all math with the
1230                  * numbers scaled up by both counts. */
1231                 u32 total_delta = 0;
1232                 for (int i = 0; i < NUM_OBSERVATION_TYPES; i++) {
1233                         u32 expected = stats->observations[i] *
1234                                         stats->num_new_observations;
1235                         u32 actual = stats->new_observations[i] *
1236                                         stats->num_observations;
1237                         u32 delta = (actual > expected) ? actual - expected :
1238                                                           expected - actual;
1239                         total_delta += delta;
1240                 }
1241
1242                 /* Ready to end the block? */
1243                 if (total_delta >=
1244                     stats->num_new_observations * 7 / 8 * stats->num_observations)
1245                         return true;
1246         }
1247
1248         for (int i = 0; i < NUM_OBSERVATION_TYPES; i++) {
1249                 stats->num_observations += stats->new_observations[i];
1250                 stats->observations[i] += stats->new_observations[i];
1251                 stats->new_observations[i] = 0;
1252         }
1253         stats->num_new_observations = 0;
1254         return false;
1255 }
1256
1257 static inline bool
1258 lzx_should_end_block(struct lzx_block_split_stats *stats,
1259                      const u8 *in_block_begin, const u8 *in_next, const u8 *in_end)
1260 {
1261         /* Ready to check block split statistics? */
1262         if (stats->num_new_observations < NUM_OBSERVATIONS_PER_BLOCK_CHECK ||
1263             in_next - in_block_begin < MIN_BLOCK_SIZE ||
1264             in_end - in_next < MIN_BLOCK_SIZE)
1265                 return false;
1266
1267         return lzx_end_block_check(stats);
1268 }
1269
1270 /******************************************************************************/
1271 /*                   Slower ("near-optimal") compression algorithm            */
1272 /*----------------------------------------------------------------------------*/
1273
1274 /*
1275  * Least-recently-used queue for match offsets.
1276  *
1277  * This is represented as a 64-bit integer for efficiency.  There are three
1278  * offsets of 21 bits each.  The remaining top bit holds garbage.
1279  */
1280 struct lzx_lru_queue {
1281         u64 R;
1282 };
1283
1284 #define LZX_QUEUE_OFFSET_SHIFT  21
1285 #define LZX_QUEUE_OFFSET_MASK   (((u64)1 << LZX_QUEUE_OFFSET_SHIFT) - 1)
1286
1287 #define LZX_QUEUE_R0_SHIFT (0 * LZX_QUEUE_OFFSET_SHIFT)
1288 #define LZX_QUEUE_R1_SHIFT (1 * LZX_QUEUE_OFFSET_SHIFT)
1289 #define LZX_QUEUE_R2_SHIFT (2 * LZX_QUEUE_OFFSET_SHIFT)
1290
1291 #define LZX_QUEUE_R0_MASK (LZX_QUEUE_OFFSET_MASK << LZX_QUEUE_R0_SHIFT)
1292 #define LZX_QUEUE_R1_MASK (LZX_QUEUE_OFFSET_MASK << LZX_QUEUE_R1_SHIFT)
1293 #define LZX_QUEUE_R2_MASK (LZX_QUEUE_OFFSET_MASK << LZX_QUEUE_R2_SHIFT)
1294
1295 #define LZX_QUEUE_INITIALIZER {         \
1296         ((u64)1 << LZX_QUEUE_R0_SHIFT) |        \
1297         ((u64)1 << LZX_QUEUE_R1_SHIFT) |        \
1298         ((u64)1 << LZX_QUEUE_R2_SHIFT) }
1299
1300 static inline u64
1301 lzx_lru_queue_R0(struct lzx_lru_queue queue)
1302 {
1303         return (queue.R >> LZX_QUEUE_R0_SHIFT) & LZX_QUEUE_OFFSET_MASK;
1304 }
1305
1306 static inline u64
1307 lzx_lru_queue_R1(struct lzx_lru_queue queue)
1308 {
1309         return (queue.R >> LZX_QUEUE_R1_SHIFT) & LZX_QUEUE_OFFSET_MASK;
1310 }
1311
1312 static inline u64
1313 lzx_lru_queue_R2(struct lzx_lru_queue queue)
1314 {
1315         return (queue.R >> LZX_QUEUE_R2_SHIFT) & LZX_QUEUE_OFFSET_MASK;
1316 }
1317
1318 /* Push a match offset onto the front (most recently used) end of the queue.  */
1319 static inline struct lzx_lru_queue
1320 lzx_lru_queue_push(struct lzx_lru_queue queue, u32 offset)
1321 {
1322         return (struct lzx_lru_queue) {
1323                 .R = (queue.R << LZX_QUEUE_OFFSET_SHIFT) | offset,
1324         };
1325 }
1326
1327 /* Swap a match offset to the front of the queue.  */
1328 static inline struct lzx_lru_queue
1329 lzx_lru_queue_swap(struct lzx_lru_queue queue, unsigned idx)
1330 {
1331         unsigned shift = idx * LZX_QUEUE_OFFSET_SHIFT;
1332         const u64 mask = LZX_QUEUE_R0_MASK;
1333         const u64 mask_high = mask << shift;
1334
1335         return (struct lzx_lru_queue) {
1336                 .R = (queue.R & ~(mask | mask_high)) |
1337                      ((queue.R & mask_high) >> shift) |
1338                      ((queue.R & mask) << shift),
1339         };
1340 }
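
     /*
      * Illustrative example of the queue operations above (not actual
      * compressor state): starting from a queue whose offsets are
      * (R0, R1, R2) = (a, b, c),
      *
      *     lzx_lru_queue_push(queue, x) yields (x, a, b); the old R2 is
      *     shifted out of the three offset fields and is never read back.
      *
      *     lzx_lru_queue_swap(queue, 2) yields (c, b, a); R0 and R2 are
      *     exchanged while R1 is left unchanged.
      */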
1341
1342 static inline u32
1343 lzx_walk_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit,
1344                    bool record)
1345 {
1346         u32 node_idx = block_size;
1347         u32 seq_idx = ARRAY_LEN(c->chosen_sequences) - 1;
1348         u32 lit_start_node;
1349
1350         if (record) {
1351                 /* Special value to mark last sequence  */
1352                 c->chosen_sequences[seq_idx].adjusted_offset_and_match_hdr = 0x80000000;
1353                 lit_start_node = node_idx;
1354         }
1355
1356         for (;;) {
1357                 u32 item;
1358                 u32 len;
1359                 u32 offset_data;
1360                 unsigned v;
1361                 unsigned offset_slot;
1362
1363                 /* Tally literals until either a match or the beginning of the
1364                  * block is reached.  Note: the item in the node at the
1365                  * beginning of the block has all bits set, causing this loop to
1366                  * end when it is reached. */
1367                 for (;;) {
1368                         item = c->optimum_nodes[node_idx].item;
1369                         if (item & OPTIMUM_LEN_MASK)
1370                                 break;
1371                         c->freqs.main[item >> OPTIMUM_OFFSET_SHIFT]++;
1372                         node_idx--;
1373                 }
1374
1375         #if CONSIDER_GAP_MATCHES
1376                 if (item & OPTIMUM_GAP_MATCH) {
1377
1378                         if (node_idx == 0)
1379                                 break;
1380
1381                         /* Save the literal run length for the next sequence
1382                          * (the "previous sequence" when walking backwards). */
1383                         len = item & OPTIMUM_LEN_MASK;
1384                         if (record) {
1385                                 c->chosen_sequences[seq_idx--].litrunlen =
1386                                                 lit_start_node - node_idx;
1387                                 lit_start_node = node_idx - len;
1388                         }
1389
1390                         /* Tally the rep0 match after the gap. */
1391                         v = len - LZX_MIN_MATCH_LEN;
1392                         if (record)
1393                                 c->chosen_sequences[seq_idx].adjusted_length = v;
1394                         if (v >= LZX_NUM_PRIMARY_LENS) {
1395                                 c->freqs.len[v - LZX_NUM_PRIMARY_LENS]++;
1396                                 v = LZX_NUM_PRIMARY_LENS;
1397                         }
1398                         c->freqs.main[LZX_NUM_CHARS + v]++;
1399                         if (record)
1400                                 c->chosen_sequences[seq_idx].adjusted_offset_and_match_hdr = v;
1401
1402                         /* Tally the literal in the gap. */
1403                         c->freqs.main[(u8)(item >> OPTIMUM_OFFSET_SHIFT)]++;
1404
1405                         /* Fall through and tally the match before the gap.
1406                          * (It was temporarily saved in the 'cost' field of the
1407                          * previous node, which was free to reuse.) */
1408                         item = c->optimum_nodes[--node_idx].cost;
1409                         node_idx -= len;
1410                 }
1411         #else /* CONSIDER_GAP_MATCHES */
1412                 if (node_idx == 0)
1413                         break;
1414         #endif /* !CONSIDER_GAP_MATCHES */
1415
1416                 len = item & OPTIMUM_LEN_MASK;
1417                 offset_data = item >> OPTIMUM_OFFSET_SHIFT;
1418
1419                 /* Save the literal run length for the next sequence (the
1420                  * "previous sequence" when walking backwards). */
1421                 if (record) {
1422                         c->chosen_sequences[seq_idx--].litrunlen =
1423                                         lit_start_node - node_idx;
1424                         node_idx -= len;
1425                         lit_start_node = node_idx;
1426                 } else {
1427                         node_idx -= len;
1428                 }
1429
1430                 /* Record a match. */
1431
1432                 /* Tally the aligned offset symbol if needed. */
1433                 if (offset_data >= 16)
1434                         c->freqs.aligned[offset_data & LZX_ALIGNED_OFFSET_BITMASK]++;
1435
1436                 /* Save the adjusted length. */
1437                 v = len - LZX_MIN_MATCH_LEN;
1438                 if (record)
1439                         c->chosen_sequences[seq_idx].adjusted_length = v;
1440
1441                 /* Tally the length symbol if needed. */
1442                 if (v >= LZX_NUM_PRIMARY_LENS) {
1443                         c->freqs.len[v - LZX_NUM_PRIMARY_LENS]++;
1444                         v = LZX_NUM_PRIMARY_LENS;
1445                 }
1446
1447                 /* Tally the main symbol. */
1448                 offset_slot = lzx_get_offset_slot(c, offset_data, is_16_bit);
1449                 v += offset_slot * LZX_NUM_LEN_HEADERS;
1450                 c->freqs.main[LZX_NUM_CHARS + v]++;
1451
1452                 /* Save the adjusted offset and match header. */
1453                 if (record) {
1454                         c->chosen_sequences[seq_idx].adjusted_offset_and_match_hdr =
1455                                         (offset_data << 9) | v;
1456                 }
1457         }
1458
1459         /* Save the literal run length for the first sequence. */
1460         if (record)
1461                 c->chosen_sequences[seq_idx].litrunlen = lit_start_node - node_idx;
1462
1463         /* Return the index in c->chosen_sequences at which the lzx_sequences
1464          * begin. */
1465         return seq_idx;
1466 }
1467
1468 /*
1469  * Given the minimum-cost path computed through the item graph for the current
1470  * block, walk the path and count how many of each symbol in each Huffman-coded
1471  * alphabet would be required to output the items (matches and literals) along
1472  * the path.
1473  *
1474  * Note that the path will be walked backwards (from the end of the block to the
1475  * beginning of the block), but this doesn't matter because this function only
1476  * computes frequencies.
1477  */
1478 static inline void
1479 lzx_tally_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit)
1480 {
1481         lzx_walk_item_list(c, block_size, is_16_bit, false);
1482 }
1483
1484 /*
1485  * Like lzx_tally_item_list(), but this function also generates the list of
1486  * lzx_sequences for the minimum-cost path and writes it to c->chosen_sequences,
1487  * ready to be output to the bitstream after the Huffman codes are computed.
1488  * The lzx_sequences will be written to decreasing memory addresses as the path
1489  * is walked backwards, which means they will end up in the expected
1490  * first-to-last order.  The return value is the index in c->chosen_sequences at
1491  * which the lzx_sequences begin.
1492  */
1493 static inline u32
1494 lzx_record_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit)
1495 {
1496         return lzx_walk_item_list(c, block_size, is_16_bit, true);
1497 }
1498
1499 /*
1500  * Find an inexpensive path through the graph of possible match/literal choices
1501  * for the current block.  The nodes of the graph are
1502  * c->optimum_nodes[0...block_size].  They correspond directly to the bytes in
1503  * the current block, plus one extra node for end-of-block.  The edges of the
1504  * graph are matches and literals.  The goal is to find the minimum cost path
1505  * from 'c->optimum_nodes[0]' to 'c->optimum_nodes[block_size]', given the cost
1506  * model 'c->costs'.
1507  *
1508  * The algorithm works forwards, starting at 'c->optimum_nodes[0]' and
1509  * proceeding forwards one node at a time.  At each node, a selection of matches
1510  * (len >= 2), as well as the literal byte (len = 1), is considered.  An item of
1511  * length 'len' provides a new path to reach the node 'len' bytes later.  If
1512  * such a path is the lowest cost found so far to reach that later node, then
1513  * that later node is updated with the new path.
1514  *
1515  * Note that although this algorithm is based on minimum cost path search, due
1516  * to various simplifying assumptions the result is not guaranteed to be the
1517  * true minimum cost, or "optimal", path over the graph of all valid LZX
1518  * representations of this block.
1519  *
1520  * Also, note that because of the presence of the recent offsets queue (which is
1521  * a type of adaptive state), the algorithm cannot work backwards and compute
1522  * "cost to end" instead of "cost to beginning".  Furthermore, the way the
1523  * algorithm handles this adaptive state in the "minimum cost" parse is actually
1524  * only an approximation.  It's possible for the globally optimal, minimum cost
1525  * path to contain a prefix, ending at a position, where that path prefix is
1526  * *not* the minimum cost path to that position.  This can happen if such a path
1527  * prefix results in a different adaptive state which results in lower costs
1528  * later.  The algorithm does not solve this problem; it only considers the
1529  * lowest cost to reach each individual position.
1530  */
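
     /*
      * For example (hypothetical costs): if the lowest cost found so far to
      * reach node i is 100, and a match of length 4 starting at position i
      * costs 30 under the current cost model, then node i + 4 is reachable
      * with cost 130; its cost and item are updated if 130 is lower than the
      * best cost previously recorded for node i + 4.
      */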
1531 static inline struct lzx_lru_queue
1532 lzx_find_min_cost_path(struct lzx_compressor * const restrict c,
1533                        const u8 * const restrict block_begin,
1534                        const u32 block_size,
1535                        const struct lzx_lru_queue initial_queue,
1536                        bool is_16_bit)
1537 {
1538         struct lzx_optimum_node *cur_node = c->optimum_nodes;
1539         struct lzx_optimum_node * const end_node = cur_node + block_size;
1540         struct lz_match *cache_ptr = c->match_cache;
1541         const u8 *in_next = block_begin;
1542         const u8 * const block_end = block_begin + block_size;
1543
1544         /*
1545          * Instead of storing the match offset LRU queues in the
1546          * 'lzx_optimum_node' structures, we save memory (and cache lines) by
1547          * storing them in a smaller array.  This works because the algorithm
1548          * only requires a limited history of the adaptive state.  Once a given
1549          * state is more than LZX_MAX_MATCH_LEN bytes behind the current node
1550          * (more if gap match consideration is enabled; we just round up to 512
1551          * so it's a power of 2), it is no longer needed.
1552          */
1553         struct lzx_lru_queue queues[512];
1554         STATIC_ASSERT(ARRAY_LEN(queues) >= LZX_MAX_MATCH_LEN + 1);
1555         STATIC_ASSERT(sizeof(c->optimum_nodes[0]) == sizeof(queues[0]));
1556 #define QUEUE(node) \
1557         (*(struct lzx_lru_queue *)((char *)queues + \
1558                         ((uintptr_t)(node) % (ARRAY_LEN(queues) * sizeof(*(node))))))
1559         /* Conceptually: queues[(uintptr_t)(node) / sizeof(*(node)) % ARRAY_LEN(queues)] */
1560
1561 #if CONSIDER_GAP_MATCHES
1562         u32 matches_before_gap[ARRAY_LEN(queues)];
1563 #define MATCH_BEFORE_GAP(node) \
1564         (matches_before_gap[(uintptr_t)(node) / sizeof(*(node)) % \
1565                             ARRAY_LEN(matches_before_gap)])
1566 #endif
1567
1568         /*
1569          * Initially, the cost to reach each node is "infinity".
1570          *
1571          * The first node actually should have cost 0, but "infinity"
1572          * (0xFFFFFFFF) works just as well because it immediately overflows.
1573          *
1574          * This statement also intentionally sets the 'item' of the first node,
1575          * which would otherwise have no meaning, to 0xFFFFFFFF for use as a
1576          * sentinel.  See lzx_walk_item_list().
1577          */
1578         memset(c->optimum_nodes, 0xFF,
1579                (block_size + 1) * sizeof(c->optimum_nodes[0]));
1580
1581         /* Initialize the recent offsets queue for the first node. */
1582         QUEUE(cur_node) = initial_queue;
1583
1584         do { /* For each node in the block in position order... */
1585
1586                 unsigned num_matches;
1587                 unsigned literal;
1588                 u32 cost;
1589
1590                 /*
1591                  * A selection of matches for the block was already saved in
1592                  * memory so that we don't have to run the uncompressed data
1593                  * through the matchfinder on every optimization pass.  However,
1594                  * we still search for repeat offset matches during each
1595                  * optimization pass because we cannot predict the state of the
1596                  * recent offsets queue.  But as a heuristic, we don't bother
1597                  * searching for repeat offset matches if the general-purpose
1598                  * matchfinder failed to find any matches.
1599                  *
1600                  * Note that a match of length n at some offset implies there is
1601                  * also a match of length l for LZX_MIN_MATCH_LEN <= l <= n at
1602                  * that same offset.  In other words, we don't necessarily need
1603                  * to use the full length of a match.  The key heuristic that
1604                  * saves a significant amount of time is that for each
1605                  * distinct length, we only consider the smallest offset for
1606                  * which that length is available.  This heuristic also applies
1607                  * to repeat offsets, which we order specially: R0 < R1 < R2 <
1608                  * any explicit offset.  Of course, this heuristic may
1609                  * produce suboptimal results because offset slots in LZX are
1610                  * subject to entropy encoding, but in practice this is a useful
1611                  * heuristic.
1612                  */
1613
1614                 num_matches = cache_ptr->length;
1615                 cache_ptr++;
1616
1617                 if (num_matches) {
1618                         struct lz_match *end_matches = cache_ptr + num_matches;
1619                         unsigned next_len = LZX_MIN_MATCH_LEN;
1620                         unsigned max_len = min(block_end - in_next, LZX_MAX_MATCH_LEN);
1621                         const u8 *matchptr;
1622
1623                         /* Consider rep0 matches. */
1624                         matchptr = in_next - lzx_lru_queue_R0(QUEUE(cur_node));
1625                         if (load_u16_unaligned(matchptr) != load_u16_unaligned(in_next))
1626                                 goto rep0_done;
1627                         STATIC_ASSERT(LZX_MIN_MATCH_LEN == 2);
1628                         do {
1629                                 u32 cost = cur_node->cost +
1630                                            c->costs.match_cost[0][
1631                                                         next_len - LZX_MIN_MATCH_LEN];
1632                                 if (cost <= (cur_node + next_len)->cost) {
1633                                         (cur_node + next_len)->cost = cost;
1634                                         (cur_node + next_len)->item =
1635                                                 (0 << OPTIMUM_OFFSET_SHIFT) | next_len;
1636                                 }
1637                                 if (unlikely(++next_len > max_len)) {
1638                                         cache_ptr = end_matches;
1639                                         goto done_matches;
1640                                 }
1641                         } while (in_next[next_len - 1] == matchptr[next_len - 1]);
1642
1643                 rep0_done:
1644
1645                         /* Consider rep1 matches. */
1646                         matchptr = in_next - lzx_lru_queue_R1(QUEUE(cur_node));
1647                         if (load_u16_unaligned(matchptr) != load_u16_unaligned(in_next))
1648                                 goto rep1_done;
1649                         if (matchptr[next_len - 1] != in_next[next_len - 1])
1650                                 goto rep1_done;
1651                         for (unsigned len = 2; len < next_len - 1; len++)
1652                                 if (matchptr[len] != in_next[len])
1653                                         goto rep1_done;
1654                         do {
1655                                 u32 cost = cur_node->cost +
1656                                            c->costs.match_cost[1][
1657                                                         next_len - LZX_MIN_MATCH_LEN];
1658                                 if (cost <= (cur_node + next_len)->cost) {
1659                                         (cur_node + next_len)->cost = cost;
1660                                         (cur_node + next_len)->item =
1661                                                 (1 << OPTIMUM_OFFSET_SHIFT) | next_len;
1662                                 }
1663                                 if (unlikely(++next_len > max_len)) {
1664                                         cache_ptr = end_matches;
1665                                         goto done_matches;
1666                                 }
1667                         } while (in_next[next_len - 1] == matchptr[next_len - 1]);
1668
1669                 rep1_done:
1670
1671                         /* Consider rep2 matches. */
1672                         matchptr = in_next - lzx_lru_queue_R2(QUEUE(cur_node));
1673                         if (load_u16_unaligned(matchptr) != load_u16_unaligned(in_next))
1674                                 goto rep2_done;
1675                         if (matchptr[next_len - 1] != in_next[next_len - 1])
1676                                 goto rep2_done;
1677                         for (unsigned len = 2; len < next_len - 1; len++)
1678                                 if (matchptr[len] != in_next[len])
1679                                         goto rep2_done;
1680                         do {
1681                                 u32 cost = cur_node->cost +
1682                                            c->costs.match_cost[2][
1683                                                         next_len - LZX_MIN_MATCH_LEN];
1684                                 if (cost <= (cur_node + next_len)->cost) {
1685                                         (cur_node + next_len)->cost = cost;
1686                                         (cur_node + next_len)->item =
1687                                                 (2 << OPTIMUM_OFFSET_SHIFT) | next_len;
1688                                 }
1689                                 if (unlikely(++next_len > max_len)) {
1690                                         cache_ptr = end_matches;
1691                                         goto done_matches;
1692                                 }
1693                         } while (in_next[next_len - 1] == matchptr[next_len - 1]);
1694
1695                 rep2_done:
1696
1697                         while (next_len > cache_ptr->length)
1698                                 if (++cache_ptr == end_matches)
1699                                         goto done_matches;
1700
1701                         /* Consider explicit offset matches. */
1702                         for (;;) {
1703                                 u32 offset = cache_ptr->offset;
1704                                 u32 offset_data = offset + LZX_OFFSET_ADJUSTMENT;
1705                                 unsigned offset_slot = lzx_get_offset_slot(c, offset_data, is_16_bit);
1706                                 u32 base_cost = cur_node->cost;
1707                                 u32 cost;
1708
1709                         #if CONSIDER_ALIGNED_COSTS
1710                                 if (offset >= 16 - LZX_OFFSET_ADJUSTMENT)
1711                                         base_cost += c->costs.aligned[offset_data &
1712                                                                       LZX_ALIGNED_OFFSET_BITMASK];
1713                         #endif
1714                                 do {
1715                                         cost = base_cost +
1716                                                c->costs.match_cost[offset_slot][
1717                                                                 next_len - LZX_MIN_MATCH_LEN];
1718                                         if (cost < (cur_node + next_len)->cost) {
1719                                                 (cur_node + next_len)->cost = cost;
1720                                                 (cur_node + next_len)->item =
1721                                                         (offset_data << OPTIMUM_OFFSET_SHIFT) | next_len;
1722                                         }
1723                                 } while (++next_len <= cache_ptr->length);
1724
1725                                 if (++cache_ptr == end_matches) {
1726                                 #if CONSIDER_GAP_MATCHES
1727                                         /* Also consider the longest explicit
1728                                          * offset match as a "gap match": match
1729                                          * + lit + rep0. */
1730                                         s32 remaining = (block_end - in_next) - next_len;
1731                                         if (likely(remaining >= 2)) {
1732                                                 const u8 *strptr = in_next + next_len;
1733                                                 const u8 *matchptr = strptr - offset;
1734                                                 if (load_u16_unaligned(strptr) == load_u16_unaligned(matchptr)) {
1735                                                         STATIC_ASSERT(ARRAY_LEN(queues) - LZX_MAX_MATCH_LEN - 2 >= 250);
1736                                                         STATIC_ASSERT(ARRAY_LEN(queues) == ARRAY_LEN(matches_before_gap));
1737                                                         u32 limit = min(remaining,
1738                                                                         min(ARRAY_LEN(queues) - LZX_MAX_MATCH_LEN - 2,
1739                                                                             LZX_MAX_MATCH_LEN));
1740                                                         u32 rep0_len = lz_extend(strptr, matchptr, 2, limit);
1741                                                         u8 lit = strptr[-1];
1742                                                         cost += c->costs.main[lit] +
1743                                                                 c->costs.match_cost[0][rep0_len - LZX_MIN_MATCH_LEN];
1744                                                         u32 total_len = next_len + rep0_len;
1745                                                         if (cost < (cur_node + total_len)->cost) {
1746                                                                 (cur_node + total_len)->cost = cost;
1747                                                                 (cur_node + total_len)->item =
1748                                                                         OPTIMUM_GAP_MATCH |
1749                                                                         ((u32)lit << OPTIMUM_OFFSET_SHIFT) |
1750                                                                         rep0_len;
1751                                                                 MATCH_BEFORE_GAP(cur_node + total_len) =
1752                                                                         (offset_data << OPTIMUM_OFFSET_SHIFT) |
1753                                                                         (next_len - 1);
1754                                                         }
1755                                                 }
1756                                         }
1757                                 #endif /* CONSIDER_GAP_MATCHES */
1758                                         break;
1759                                 }
1760                         }
1761                 }
1762
1763         done_matches:
1764
1765                 /* Consider coding a literal.
1766                  *
1767                  * To avoid an extra branch, actually checking the preferability
1768                  * of coding the literal is integrated into the queue update
1769                  * code below. */
1770                 literal = *in_next++;
1771                 cost = cur_node->cost + c->costs.main[literal];
1772
1773                 /* Advance to the next position. */
1774                 cur_node++;
1775
1776                 /* The lowest-cost path to the current position is now known.
1777                  * Finalize the recent offsets queue that results from taking
1778                  * this lowest-cost path. */
1779
1780                 if (cost <= cur_node->cost) {
1781                         /* Literal: queue remains unchanged. */
1782                         cur_node->cost = cost;
1783                         cur_node->item = (u32)literal << OPTIMUM_OFFSET_SHIFT;
1784                         QUEUE(cur_node) = QUEUE(cur_node - 1);
1785                 } else {
1786                         /* Match: queue update is needed. */
1787                         unsigned len = cur_node->item & OPTIMUM_LEN_MASK;
1788                 #if CONSIDER_GAP_MATCHES
1789                         s32 offset_data = (s32)cur_node->item >> OPTIMUM_OFFSET_SHIFT;
1790                         STATIC_ASSERT(OPTIMUM_GAP_MATCH == 0x80000000); /* assuming sign extension */
1791                 #else
1792                         u32 offset_data = cur_node->item >> OPTIMUM_OFFSET_SHIFT;
1793                 #endif
1794
1795                         if (offset_data >= LZX_NUM_RECENT_OFFSETS) {
1796                                 /* Explicit offset match: insert offset at front. */
1797                                 QUEUE(cur_node) =
1798                                         lzx_lru_queue_push(QUEUE(cur_node - len),
1799                                                            offset_data - LZX_OFFSET_ADJUSTMENT);
1800                         } else
1801                 #if CONSIDER_GAP_MATCHES
1802                         if (offset_data < 0) {
1803                                 /* "Gap match": Explicit offset match, then a
1804                                  * literal, then rep0 match.  Save the explicit
1805                                  * offset match information in the cost field of
1806                                  * the previous node, which isn't needed
1807                                  * anymore.  Then insert the offset at the front
1808                                  * of the queue. */
1809                                 u32 match_before_gap = MATCH_BEFORE_GAP(cur_node);
1810                                 (cur_node - 1)->cost = match_before_gap;
1811                                 QUEUE(cur_node) =
1812                                         lzx_lru_queue_push(QUEUE(cur_node - len - 1 -
1813                                                                  (match_before_gap & OPTIMUM_LEN_MASK)),
1814                                                            (match_before_gap >> OPTIMUM_OFFSET_SHIFT) -
1815                                                            LZX_OFFSET_ADJUSTMENT);
1816                         } else
1817                 #endif
1818                         {
1819                                 /* Repeat offset match: swap offset to front. */
1820                                 QUEUE(cur_node) =
1821                                         lzx_lru_queue_swap(QUEUE(cur_node - len),
1822                                                            offset_data);
1823                         }
1824                 }
1825         } while (cur_node != end_node);
1826
1827         /* Return the recent offsets queue at the end of the path. */
1828         return QUEUE(cur_node);
1829 }
1830
1831 /* Given the costs for the main and length codewords, compute 'match_costs'.  */
1832 static void
1833 lzx_compute_match_costs(struct lzx_compressor *c)
1834 {
1835         unsigned num_offset_slots = (c->num_main_syms - LZX_NUM_CHARS) /
1836                                         LZX_NUM_LEN_HEADERS;
1837         struct lzx_costs *costs = &c->costs;
1838
1839         for (unsigned offset_slot = 0; offset_slot < num_offset_slots;
1840              offset_slot++)
1841         {
1842                 u32 extra_cost = (u32)lzx_extra_offset_bits[offset_slot] <<
1843                                                                 COST_SHIFT;
1844                 unsigned main_symbol = LZX_NUM_CHARS + (offset_slot *
1845                                                         LZX_NUM_LEN_HEADERS);
1846                 unsigned i;
1847
1848         #if CONSIDER_ALIGNED_COSTS
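                     /* In aligned offset blocks, the low 3 extra offset bits
                      * for offset slots >= 8 are encoded with the aligned
                      * offset code rather than verbatim, and their cost is
                      * added separately per match (see the use of
                      * c->costs.aligned[] in lzx_find_min_cost_path()).
                      * Exclude those 3 bits here so they aren't counted
                      * twice. */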
1849                 if (offset_slot >= 8)
1850                         extra_cost -= LZX_NUM_ALIGNED_OFFSET_BITS << COST_SHIFT;
1851         #endif
1852
1853                 for (i = 0; i < LZX_NUM_PRIMARY_LENS; i++) {
1854                         costs->match_cost[offset_slot][i] =
1855                                 costs->main[main_symbol++] + extra_cost;
1856                 }
1857
1858                 extra_cost += costs->main[main_symbol];
1859
1860                 for (; i < LZX_NUM_LENS; i++) {
1861                         costs->match_cost[offset_slot][i] =
1862                                 costs->len[i - LZX_NUM_PRIMARY_LENS] +
1863                                 extra_cost;
1864                 }
1865         }
1866 }
1867
1868 /* Set default LZX Huffman symbol costs to bootstrap the iterative optimization
1869  * algorithm.  */
1870 static void
1871 lzx_set_default_costs(struct lzx_compressor *c, const u8 *block, u32 block_size)
1872 {
1873         u32 i;
1874         bool have_byte[256];
1875         unsigned num_used_bytes;
1876
1877         /* The costs below are hard coded to use a COST_SHIFT of 6.  */
1878         STATIC_ASSERT(COST_SHIFT == 6);
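             /* With COST_SHIFT == 6, costs are expressed in units of 1/64 of a
              * bit; for example, a cost of 560 corresponds to 560/64 = 8.75
              * bits. */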
1879
1880         /*
1881          * Heuristics:
1882          *
1883          * - Use smaller initial costs for literal symbols when the input buffer
1884          *   contains fewer distinct bytes.
1885          *
1886          * - Assume that match symbols are more costly than literal symbols.
1887          *
1888          * - Assume that length symbols for shorter lengths are less costly than
1889          *   length symbols for longer lengths.
1890          */
1891
1892         for (i = 0; i < 256; i++)
1893                 have_byte[i] = false;
1894
1895         for (i = 0; i < block_size; i++)
1896                 have_byte[block[i]] = true;
1897
1898         num_used_bytes = 0;
1899         for (i = 0; i < 256; i++)
1900                 num_used_bytes += have_byte[i];
1901
1902         for (i = 0; i < 256; i++)
1903                 c->costs.main[i] = 560 - (256 - num_used_bytes);
1904
1905         for (; i < c->num_main_syms; i++)
1906                 c->costs.main[i] = 680;
1907
1908         for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++)
1909                 c->costs.len[i] = 412 + i;
1910
1911 #if CONSIDER_ALIGNED_COSTS
1912         for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++)
1913                 c->costs.aligned[i] = LZX_NUM_ALIGNED_OFFSET_BITS << COST_SHIFT;
1914 #endif
1915
1916         lzx_compute_match_costs(c);
1917 }
1918
1919 /* Update the current cost model to reflect the computed Huffman codes.  */
1920 static void
1921 lzx_set_costs_from_codes(struct lzx_compressor *c)
1922 {
1923         unsigned i;
1924         const struct lzx_lens *lens = &c->codes[c->codes_index].lens;
1925
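             /* A codeword length of 0 means the symbol was not used by the
              * previous optimization pass.  Charge such symbols the maximum
              * codeword length instead, so they remain possible, though
              * expensive, choices in the next pass. */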
1926         for (i = 0; i < c->num_main_syms; i++) {
1927                 c->costs.main[i] = (lens->main[i] ? lens->main[i] :
1928                                     MAIN_CODEWORD_LIMIT) << COST_SHIFT;
1929         }
1930
1931         for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++) {
1932                 c->costs.len[i] = (lens->len[i] ? lens->len[i] :
1933                                    LENGTH_CODEWORD_LIMIT) << COST_SHIFT;
1934         }
1935
1936 #if CONSIDER_ALIGNED_COSTS
1937         for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
1938                 c->costs.aligned[i] = (lens->aligned[i] ? lens->aligned[i] :
1939                                        ALIGNED_CODEWORD_LIMIT) << COST_SHIFT;
1940         }
1941 #endif
1942
1943         lzx_compute_match_costs(c);
1944 }
1945
1946 /*
1947  * Choose a "near-optimal" literal/match sequence to use for the current block,
1948  * then flush the block.  Because the cost of each Huffman symbol is unknown
1949  * until the Huffman codes have been built and the Huffman codes themselves
1950  * depend on the symbol frequencies, this uses an iterative optimization
1951  * algorithm to approximate an optimal solution.  The first optimization pass
1952  * for the block uses default costs.  Additional passes use costs taken from the
1953  * Huffman codes computed in the previous pass.
1954  */
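
     /*
      * For example, with c->num_optim_passes == 2: the first pass finds a
      * min-cost path using the default costs; the symbol frequencies tallied
      * from that path determine Huffman codes whose codeword lengths become
      * the costs for the second pass, and the second pass's path is the one
      * actually recorded and written to the output bitstream.
      */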
1955 static inline struct lzx_lru_queue
1956 lzx_optimize_and_flush_block(struct lzx_compressor * const restrict c,
1957                              struct lzx_output_bitstream * const restrict os,
1958                              const u8 * const restrict block_begin,
1959                              const u32 block_size,
1960                              const struct lzx_lru_queue initial_queue,
1961                              bool is_16_bit)
1962 {
1963         unsigned num_passes_remaining = c->num_optim_passes;
1964         struct lzx_lru_queue new_queue;
1965         u32 seq_idx;
1966
1967         lzx_set_default_costs(c, block_begin, block_size);
1968
1969         for (;;) {
1970                 new_queue = lzx_find_min_cost_path(c, block_begin, block_size,
1971                                                    initial_queue, is_16_bit);
1972
1973                 if (--num_passes_remaining == 0)
1974                         break;
1975
1976                 /* At least one optimization pass remains.  Update the costs. */
1977                 lzx_reset_symbol_frequencies(c);
1978                 lzx_tally_item_list(c, block_size, is_16_bit);
1979                 lzx_build_huffman_codes(c);
1980                 lzx_set_costs_from_codes(c);
1981         }
1982
1983         /* Done optimizing.  Generate the sequence list and flush the block. */
1984         lzx_reset_symbol_frequencies(c);
1985         seq_idx = lzx_record_item_list(c, block_size, is_16_bit);
1986         lzx_flush_block(c, os, block_begin, block_size, seq_idx);
1987         return new_queue;
1988 }
1989
1990 /*
1991  * This is the "near-optimal" LZX compressor.
1992  *
1993  * For each block, it performs a relatively thorough graph search to find an
1994  * inexpensive (in terms of compressed size) way to output the block.
1995  *
1996  * Note: there are actually many things this algorithm leaves on the table in
1997  * terms of compression ratio.  So although it may be "near-optimal", it is
1998  * certainly not "optimal".  The goal is not to produce the optimal compression
1999  * ratio, which for LZX is probably impossible within any practical amount of
2000  * time, but rather to produce a compression ratio significantly better than a
2001  * simpler "greedy" or "lazy" parse while still being relatively fast.
2002  */
2003 static inline void
2004 lzx_compress_near_optimal(struct lzx_compressor * restrict c,
2005                           const u8 * const restrict in_begin, size_t in_nbytes,
2006                           struct lzx_output_bitstream * restrict os,
2007                           bool is_16_bit)
2008 {
2009         const u8 *       in_next = in_begin;
2010         const u8 * const in_end  = in_begin + in_nbytes;
2011         u32 max_len = LZX_MAX_MATCH_LEN;
2012         u32 nice_len = min(c->nice_match_length, max_len);
2013         u32 next_hashes[2] = {0, 0};
2014         struct lzx_lru_queue queue = LZX_QUEUE_INITIALIZER;
2015
2016         /* Initialize the matchfinder. */
2017         CALL_BT_MF(is_16_bit, c, bt_matchfinder_init);
2018
2019         do {
2020                 /* Starting a new block */
2021                 const u8 * const in_block_begin = in_next;
2022                 const u8 * const in_max_block_end =
2023                         in_next + min(SOFT_MAX_BLOCK_SIZE, in_end - in_next);
2024                 struct lz_match *cache_ptr = c->match_cache;
2025                 const u8 *next_search_pos = in_next;
2026                 const u8 *next_observation =
2027                         (in_max_block_end - in_next < MIN_BLOCK_SIZE) ?
2028                          in_max_block_end : in_next;
2029                 const u8 *next_pause_point =
2030                         min(in_next + min(MIN_BLOCK_SIZE,
2031                                           in_max_block_end - in_next),
2032                             in_max_block_end - min(LZX_MAX_MATCH_LEN - 1,
2033                                                    in_max_block_end - in_next));
2034
2035                 lzx_init_block_split_stats(&c->split_stats);
2036
2037                 /*
2038                  * Run the input buffer through the matchfinder, caching the
2039                  * matches, until we decide to end the block.
2040                  *
2041                  * For a tighter matchfinding loop, we compute a "pause point",
2042                  * which is the next position at which we may need to check
2043                  * whether to end the block or to decrease max_len.  We then
2044                  * only do these extra checks upon reaching the pause point.
2045                  */
2046         resume_matchfinding:
2047                 do {
2048                         if (in_next >= next_search_pos) {
2049                                 /* Search for matches at this position. */
2050                                 struct lz_match *lz_matchptr;
2051                                 u32 best_len;
2052
2053                                 lz_matchptr = CALL_BT_MF(is_16_bit, c,
2054                                                          bt_matchfinder_get_matches,
2055                                                          in_begin,
2056                                                          in_next - in_begin,
2057                                                          max_len,
2058                                                          nice_len,
2059                                                          c->max_search_depth,
2060                                                          next_hashes,
2061                                                          &best_len,
2062                                                          cache_ptr + 1);
2063                                 cache_ptr->length = lz_matchptr - (cache_ptr + 1);
2064                                 cache_ptr = lz_matchptr;
2065
2066                                 /* Accumulate block split statistics. */
2067                                 if (in_next >= next_observation) {
2068                                         if (best_len >= 4) {
2069                                                 lzx_observe_match(&c->split_stats,
2070                                                                   best_len);
2071                                                 next_observation = in_next + best_len;
2072                                         } else {
2073                                                 lzx_observe_literal(&c->split_stats,
2074                                                                     *in_next);
2075                                                 next_observation = in_next + 1;
2076                                         }
2077                                 }
2078
2079                                 /*
2080                                  * If there was a very long match found, then
2081                                  * don't cache any matches for the bytes covered
2082                                  * by that match.  This avoids degenerate
2083                                  * behavior when compressing highly redundant
2084                                  * data, where the number of matches can be very
2085                                  * large.
2086                                  *
2087                                  * This heuristic doesn't actually hurt the
2088                                  * compression ratio very much.  If there's a
2089                                  * long match, then the data must be highly
2090                                  * compressible, so it doesn't matter as much
2091                                  * what we do.
2092                                  */
2093                                 if (best_len >= nice_len)
2094                                         next_search_pos = in_next + best_len;
2095                         } else {
2096                                 /* Don't search for matches at this position. */
2097                                 CALL_BT_MF(is_16_bit, c,
2098                                            bt_matchfinder_skip_position,
2099                                            in_begin,
2100                                            in_next - in_begin,
2101                                            nice_len,
2102                                            c->max_search_depth,
2103                                            next_hashes);
2104                                 cache_ptr->length = 0;
2105                                 cache_ptr++;
2106                         }
2107                 } while (++in_next < next_pause_point &&
2108                          likely(cache_ptr < &c->match_cache[CACHE_LENGTH]));
2109
2110                 /* Adjust max_len and nice_len if we're nearing the end of the
2111                  * input buffer. In addition, if we are so close to the end of
2112                  * the input buffer that there cannot be any more matches, then
2113                  * just advance through the last few positions and record no
2114                  * matches. */
2115                 if (unlikely(max_len > in_end - in_next)) {
2116                         max_len = in_end - in_next;
2117                         nice_len = min(max_len, nice_len);
2118                         if (max_len < BT_MATCHFINDER_REQUIRED_NBYTES) {
2119                                 while (in_next != in_end) {
2120                                         cache_ptr->length = 0;
2121                                         cache_ptr++;
2122                                         in_next++;
2123                                 }
2124                         }
2125                 }
2126
2127                 /* End the block if the match cache may overflow. */
2128                 if (unlikely(cache_ptr >= &c->match_cache[CACHE_LENGTH]))
2129                         goto end_block;
2130
2131                 /* End the block if the soft maximum size has been reached. */
2132                 if (in_next >= in_max_block_end)
2133                         goto end_block;
2134
2135                 /* End the block if the block splitting algorithm thinks this is
2136                  * a good place to do so. */
2137                 if (c->split_stats.num_new_observations >=
2138                     NUM_OBSERVATIONS_PER_BLOCK_CHECK)
2139                 {
2140                         if (in_max_block_end - in_next < MIN_BLOCK_SIZE)
2141                                 next_observation = in_max_block_end;
2142                         else if (lzx_end_block_check(&c->split_stats))
2143                                 goto end_block;
2144                 }
2145
2146                 /* It's not time to end the block yet. Compute the next pause
2147                  * point and resume the matchfinding. */
2148                 next_pause_point =
2149                         min(in_next + min(NUM_OBSERVATIONS_PER_BLOCK_CHECK * 2 -
2150                                             c->split_stats.num_new_observations,
2151                                           in_max_block_end - in_next),
2152                             in_max_block_end - min(LZX_MAX_MATCH_LEN - 1,
2153                                                    in_max_block_end - in_next));
2154                 goto resume_matchfinding;
2155
2156         end_block:
2157                 /* We've decided on a block boundary and cached matches.  Now
2158                  * choose a match/literal sequence and flush the block. */
2159                 queue = lzx_optimize_and_flush_block(c, os, in_block_begin,
2160                                                      in_next - in_block_begin,
2161                                                      queue, is_16_bit);
2162         } while (in_next != in_end);
2163 }
2164
2165 static void
2166 lzx_compress_near_optimal_16(struct lzx_compressor *c, const u8 *in,
2167                              size_t in_nbytes, struct lzx_output_bitstream *os)
2168 {
2169         lzx_compress_near_optimal(c, in, in_nbytes, os, true);
2170 }
2171
2172 static void
2173 lzx_compress_near_optimal_32(struct lzx_compressor *c, const u8 *in,
2174                              size_t in_nbytes, struct lzx_output_bitstream *os)
2175 {
2176         lzx_compress_near_optimal(c, in, in_nbytes, os, false);
2177 }
2178
2179 /******************************************************************************/
2180 /*                     Faster ("lazy") compression algorithm                  */
2181 /*----------------------------------------------------------------------------*/
2182
2183 /*
2184  * Called when the compressor decides to use a literal.  This tallies the
2185  * Huffman symbol for the literal and increments the current literal run length.
2186  */
2187 static inline void
2188 lzx_choose_literal(struct lzx_compressor *c, unsigned literal, u32 *litrunlen_p)
2189 {
2190         lzx_observe_literal(&c->split_stats, literal);
2191         c->freqs.main[literal]++;
2192         ++*litrunlen_p;
2193 }
2194
2195 /*
2196  * Called when the compressor decides to use a match.  This tallies the Huffman
2197  * symbol(s) for a match, saves the match data and the length of the preceding
2198  * literal run, and updates the recent offsets queue.
2199  */
2200 static inline void
2201 lzx_choose_match(struct lzx_compressor *c, unsigned length, u32 offset_data,
2202                  u32 recent_offsets[LZX_NUM_RECENT_OFFSETS], bool is_16_bit,
2203                  u32 *litrunlen_p, struct lzx_sequence **next_seq_p)
2204 {
2205         u32 litrunlen = *litrunlen_p;
2206         struct lzx_sequence *next_seq = *next_seq_p;
2207         unsigned offset_slot;
2208         unsigned v;
2209
2210         lzx_observe_match(&c->split_stats, length);
2211
2212         v = length - LZX_MIN_MATCH_LEN;
2213
2214         /* Save the literal run length and adjusted length. */
2215         next_seq->litrunlen = litrunlen;
2216         next_seq->adjusted_length = v;
2217
2218         /* Compute the length header, then tally the length symbol if needed. */
2219         if (v >= LZX_NUM_PRIMARY_LENS) {
2220                 c->freqs.len[v - LZX_NUM_PRIMARY_LENS]++;
2221                 v = LZX_NUM_PRIMARY_LENS;
2222         }
2223
2224         /* Compute the offset slot. */
2225         offset_slot = lzx_get_offset_slot(c, offset_data, is_16_bit);
2226
2227         /* Compute the match header. */
2228         v += offset_slot * LZX_NUM_LEN_HEADERS;
2229
2230         /* Save the adjusted offset and match header. */
2231         next_seq->adjusted_offset_and_match_hdr = (offset_data << 9) | v;
2232
2233         /* Tally the main symbol. */
2234         c->freqs.main[LZX_NUM_CHARS + v]++;
2235
2236         /* Update the recent offsets queue. */
2237         if (offset_data < LZX_NUM_RECENT_OFFSETS) {
2238                 /* Repeat offset match. */
2239                 swap(recent_offsets[0], recent_offsets[offset_data]);
2240         } else {
2241                 /* Explicit offset match. */
2242
2243                 /* Tally the aligned offset symbol if needed. */
2244                 if (offset_data >= 16)
2245                         c->freqs.aligned[offset_data & LZX_ALIGNED_OFFSET_BITMASK]++;
2246
2247                 recent_offsets[2] = recent_offsets[1];
2248                 recent_offsets[1] = recent_offsets[0];
2249                 recent_offsets[0] = offset_data - LZX_OFFSET_ADJUSTMENT;
2250         }
2251
2252         /* Reset the literal run length and advance to the next sequence. */
2253         *next_seq_p = next_seq + 1;
2254         *litrunlen_p = 0;
2255 }
2256
2257 /*
2258  * Called when the compressor ends a block.  This finishes the last lzx_sequence,
2259  * which is just a literal run with no following match.  This literal run might
2260  * be empty.
2261  */
2262 static inline void
2263 lzx_finish_sequence(struct lzx_sequence *last_seq, u32 litrunlen)
2264 {
2265         last_seq->litrunlen = litrunlen;
2266
2267         /* Special value to mark last sequence */
2268         last_seq->adjusted_offset_and_match_hdr = 0x80000000;
2269 }
2270
2271 /*
2272  * Find the longest repeat offset match with the current position.  If a match
2273  * is found, return its length and set *best_rep_idx_ret to the index of its
2274  * offset in @recent_offsets.  Otherwise, return 0.
2275  *
2276  * Don't bother with length 2 matches; consider matches of length >= 3 only.
2277  */
2278 static unsigned
2279 lzx_find_longest_repeat_offset_match(const u8 * const in_next,
2280                                      const u32 recent_offsets[],
2281                                      const unsigned max_len,
2282                                      unsigned *best_rep_idx_ret)
2283 {
2284         STATIC_ASSERT(LZX_NUM_RECENT_OFFSETS == 3); /* loop is unrolled */
2285
2286         const u32 seq3 = load_u24_unaligned(in_next);
2287         const u8 *matchptr;
2288         unsigned best_rep_len;
2289         unsigned best_rep_idx;
2290         unsigned rep_len;
2291
2292         /* Check for rep0 match (most recent offset) */
2293         matchptr = in_next - recent_offsets[0];
2294         if (load_u24_unaligned(matchptr) == seq3)
2295                 best_rep_len = lz_extend(in_next, matchptr, 3, max_len);
2296         else
2297                 best_rep_len = 0;
2298         best_rep_idx = 0;
2299
2300         /* Check for rep1 match (second most recent offset) */
2301         matchptr = in_next - recent_offsets[1];
2302         if (load_u24_unaligned(matchptr) == seq3) {
2303                 rep_len = lz_extend(in_next, matchptr, 3, max_len);
2304                 if (rep_len > best_rep_len) {
2305                         best_rep_len = rep_len;
2306                         best_rep_idx = 1;
2307                 }
2308         }
2309
2310         /* Check for rep2 match (third most recent offset) */
2311         matchptr = in_next - recent_offsets[2];
2312         if (load_u24_unaligned(matchptr) == seq3) {
2313                 rep_len = lz_extend(in_next, matchptr, 3, max_len);
2314                 if (rep_len > best_rep_len) {
2315                         best_rep_len = rep_len;
2316                         best_rep_idx = 2;
2317                 }
2318         }
2319
2320         *best_rep_idx_ret = best_rep_idx;
2321         return best_rep_len;
2322 }
2323
2324 /*
2325  * Fast heuristic scoring for lazy parsing: how "good" is this match?
2326  * This is mainly determined by the length: longer matches are better.
2327  * However, we also give a bonus to close (small offset) matches and to repeat
2328  * offset matches, since those require fewer bits to encode.
2329  */
2330
2331 static inline unsigned
2332 lzx_explicit_offset_match_score(unsigned len, u32 adjusted_offset)
2333 {
2334         unsigned score = len;
2335
2336         if (adjusted_offset < 4096)
2337                 score++;
2338         if (adjusted_offset < 256)
2339                 score++;
2340
2341         return score;
2342 }
2343
2344 static inline unsigned
2345 lzx_repeat_offset_match_score(unsigned rep_len, unsigned rep_idx)
2346 {
2347         return rep_len + 3;
2348 }
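
     /*
      * Example of how these scores interact (hypothetical match candidates):
      * a length-5 explicit offset match at adjusted offset 100 scores
      * 5 + 2 = 7, while a length-5 repeat offset match scores 5 + 3 = 8, so
      * the repeat offset match would be preferred even though the lengths are
      * equal.
      */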
2349
2350 /*
2351  * This is the "lazy" LZX compressor.  The basic idea is that before it chooses
2352  * a match, it checks to see if there's a longer match at the next position.  If
2353  * yes, it chooses a literal and continues to the next position.  If no, it
2354  * chooses the match.
2355  *
2356  * Some additional heuristics are used as well.  Repeat offset matches are
2357  * considered favorably and sometimes are chosen immediately.  In addition, long
2358  * matches (at least "nice_len" bytes) are chosen immediately as well.  Finally,
2359  * when we decide whether a match is "better" than another, we take the offset
2360  * into consideration as well as the length.
2361  */
2362 static inline void
2363 lzx_compress_lazy(struct lzx_compressor * restrict c,
2364                   const u8 * const restrict in_begin, size_t in_nbytes,
2365                   struct lzx_output_bitstream * restrict os, bool is_16_bit)
2366 {
2367         const u8 *       in_next = in_begin;
2368         const u8 * const in_end  = in_begin + in_nbytes;
2369         unsigned max_len = LZX_MAX_MATCH_LEN;
2370         unsigned nice_len = min(c->nice_match_length, max_len);
2371         STATIC_ASSERT(LZX_NUM_RECENT_OFFSETS == 3);
2372         u32 recent_offsets[3] = {1, 1, 1};
2373         u32 next_hashes[2] = {0, 0};
2374
2375         /* Initialize the matchfinder. */
2376         CALL_HC_MF(is_16_bit, c, hc_matchfinder_init);
2377
2378         do {
2379                 /* Starting a new block */
2380
2381                 const u8 * const in_block_begin = in_next;
2382                 const u8 * const in_max_block_end =
2383                         in_next + min(SOFT_MAX_BLOCK_SIZE, in_end - in_next);
2384                 struct lzx_sequence *next_seq = c->chosen_sequences;
2385                 u32 litrunlen = 0;
2386                 unsigned cur_len;
2387                 u32 cur_offset;
2388                 u32 cur_offset_data;
2389                 unsigned cur_score;
2390                 unsigned next_len;
2391                 u32 next_offset;
2392                 u32 next_offset_data;
2393                 unsigned next_score;
2394                 unsigned best_rep_len;
2395                 unsigned best_rep_idx;
2396                 unsigned rep_score;
2397                 unsigned skip_len;
2398
2399                 lzx_reset_symbol_frequencies(c);
2400                 lzx_init_block_split_stats(&c->split_stats);
2401
2402                 do {
2403                         /* Adjust max_len and nice_len if we're nearing the end
2404                          * of the input buffer. */
2405                         if (unlikely(max_len > in_end - in_next)) {
2406                                 max_len = in_end - in_next;
2407                                 nice_len = min(max_len, nice_len);
2408                         }
2409
2410                         /* Find the longest match (subject to the
2411                          * max_search_depth cutoff parameter) with the current
2412                          * position.  Don't bother with length 2 matches; only
2413                          * look for matches of length >= 3. */
2414                         cur_len = CALL_HC_MF(is_16_bit, c,
2415                                              hc_matchfinder_longest_match,
2416                                              in_begin,
2417                                              in_next - in_begin,
2418                                              2,
2419                                              max_len,
2420                                              nice_len,
2421                                              c->max_search_depth,
2422                                              next_hashes,
2423                                              &cur_offset);
2424
2425                         /* If there was no match found, or the only match found
2426                          * was a distant short match, then choose a literal. */
2427                         if (cur_len < 3 ||
2428                             (cur_len == 3 &&
2429                              cur_offset >= 8192 - LZX_OFFSET_ADJUSTMENT &&
2430                              cur_offset != recent_offsets[0] &&
2431                              cur_offset != recent_offsets[1] &&
2432                              cur_offset != recent_offsets[2]))
2433                         {
2434                                 lzx_choose_literal(c, *in_next, &litrunlen);
2435                                 in_next++;
2436                                 continue;
2437                         }
2438
2439                         /* Heuristic: if this match has the most recent offset,
2440                          * then go ahead and choose it as a rep0 match. */
2441                         if (cur_offset == recent_offsets[0]) {
2442                                 in_next++;
2443                                 cur_offset_data = 0;
2444                                 skip_len = cur_len - 1;
2445                                 goto choose_cur_match;
2446                         }
2447
2448                         /* Compute the longest match's score as an explicit
2449                          * offset match. */
2450                         cur_offset_data = cur_offset + LZX_OFFSET_ADJUSTMENT;
2451                         cur_score = lzx_explicit_offset_match_score(cur_len, cur_offset_data);
2452
2453                         /* Find the longest repeat offset match at this
2454                          * position.  If we find one and it's "better" than the
2455                          * explicit offset match we found, then go ahead and
2456                          * choose the repeat offset match immediately. */
2457                         best_rep_len = lzx_find_longest_repeat_offset_match(in_next,
2458                                                                             recent_offsets,
2459                                                                             max_len,
2460                                                                             &best_rep_idx);
2461                         in_next++;
2462
2463                         if (best_rep_len != 0 &&
2464                             (rep_score = lzx_repeat_offset_match_score(best_rep_len,
2465                                                                        best_rep_idx)) >= cur_score)
2466                         {
2467                                 cur_len = best_rep_len;
2468                                 cur_offset_data = best_rep_idx;
2469                                 skip_len = best_rep_len - 1;
2470                                 goto choose_cur_match;
2471                         }
2472
2473                 have_cur_match:
2474                         /*
2475                          * We have a match at the current position.  If the
2476                          * match is very long, then choose it immediately.
2477                          * Otherwise, see if there's a better match at the next
2478                          * position.
2479                          */
2480
2481                         if (cur_len >= nice_len) {
2482                                 skip_len = cur_len - 1;
2483                                 goto choose_cur_match;
2484                         }
2485
2486                         if (unlikely(max_len > in_end - in_next)) {
2487                                 max_len = in_end - in_next;
2488                                 nice_len = min(max_len, nice_len);
2489                         }
2490
2491                         next_len = CALL_HC_MF(is_16_bit, c,
2492                                               hc_matchfinder_longest_match,
2493                                               in_begin,
2494                                               in_next - in_begin,
2495                                               cur_len - 2,
2496                                               max_len,
2497                                               nice_len,
2498                                               c->max_search_depth / 2,
2499                                               next_hashes,
2500                                               &next_offset);
2501
2502                         if (next_len <= cur_len - 2) {
2503                                 /* No potentially better match was found. */
2504                                 in_next++;
2505                                 skip_len = cur_len - 2;
2506                                 goto choose_cur_match;
2507                         }
2508
2509                         next_offset_data = next_offset + LZX_OFFSET_ADJUSTMENT;
2510                         next_score = lzx_explicit_offset_match_score(next_len, next_offset_data);
2511
2512                         best_rep_len = lzx_find_longest_repeat_offset_match(in_next,
2513                                                                             recent_offsets,
2514                                                                             max_len,
2515                                                                             &best_rep_idx);
2516                         in_next++;
2517
2518                         if (best_rep_len != 0 &&
2519                             (rep_score = lzx_repeat_offset_match_score(best_rep_len,
2520                                                                        best_rep_idx)) >= next_score)
2521                         {
2522
2523                                 if (rep_score > cur_score) {
2524                                         /* The next match is better, and it's a
2525                                          * repeat offset match. */
2526                                         lzx_choose_literal(c, *(in_next - 2),
2527                                                            &litrunlen);
2528                                         cur_len = best_rep_len;
2529                                         cur_offset_data = best_rep_idx;
2530                                         skip_len = cur_len - 1;
2531                                         goto choose_cur_match;
2532                                 }
2533                         } else {
2534                                 if (next_score > cur_score) {
2535                                         /* The next match is better, and it's an
2536                                          * explicit offset match. */
2537                                         lzx_choose_literal(c, *(in_next - 2),
2538                                                            &litrunlen);
2539                                         cur_len = next_len;
2540                                         cur_offset_data = next_offset_data;
2541                                         cur_score = next_score;
2542                                         goto have_cur_match;
2543                                 }
2544                         }
2545
2546                         /* The original match was better; choose it. */
2547                         skip_len = cur_len - 2;
2548
2549                 choose_cur_match:
2550                         /* Choose a match and have the matchfinder skip over its
2551                          * remaining bytes. */
2552                         lzx_choose_match(c, cur_len, cur_offset_data,
2553                                          recent_offsets, is_16_bit,
2554                                          &litrunlen, &next_seq);
2555                         in_next = CALL_HC_MF(is_16_bit, c,
2556                                              hc_matchfinder_skip_positions,
2557                                              in_begin,
2558                                              in_next - in_begin,
2559                                              in_end - in_begin,
2560                                              skip_len,
2561                                              next_hashes);
2562
2563                         /* Keep going until it's time to end the block. */
2564                 } while (in_next < in_max_block_end &&
2565                          !lzx_should_end_block(&c->split_stats, in_block_begin,
2566                                                in_next, in_end));
2567
2568                 /* Flush the block. */
2569                 lzx_finish_sequence(next_seq, litrunlen);
2570                 lzx_flush_block(c, os, in_block_begin, in_next - in_block_begin, 0);
2571
2572                 /* Keep going until we've reached the end of the input buffer. */
2573         } while (in_next != in_end);
2574 }
2575
2576 static void
2577 lzx_compress_lazy_16(struct lzx_compressor *c, const u8 *in, size_t in_nbytes,
2578                      struct lzx_output_bitstream *os)
2579 {
2580         lzx_compress_lazy(c, in, in_nbytes, os, true);
2581 }
2582
2583 static void
2584 lzx_compress_lazy_32(struct lzx_compressor *c, const u8 *in, size_t in_nbytes,
2585                      struct lzx_output_bitstream *os)
2586 {
2587         lzx_compress_lazy(c, in, in_nbytes, os, false);
2588 }
2589
2590 /******************************************************************************/
2591 /*                          Compressor operations                             */
2592 /*----------------------------------------------------------------------------*/
2593
2594 /*
2595  * Generate tables for mapping match offsets (actually, "adjusted" match
2596  * offsets) to offset slots.
2597  */
2598 static void
2599 lzx_init_offset_slot_tabs(struct lzx_compressor *c)
2600 {
2601         u32 adjusted_offset = 0;
2602         unsigned slot = 0;
2603
2604         /* slots [0, 29] */
2605         for (; adjusted_offset < ARRAY_LEN(c->offset_slot_tab_1);
2606              adjusted_offset++)
2607         {
2608                 if (adjusted_offset >= lzx_offset_slot_base[slot + 1])
2609                         slot++;
2610                 c->offset_slot_tab_1[adjusted_offset] = slot;
2611         }
2612
2613         /* slots [30, 49] */
2614         for (; adjusted_offset < LZX_MAX_WINDOW_SIZE;
2615              adjusted_offset += (u32)1 << 14)
2616         {
2617                 if (adjusted_offset >= lzx_offset_slot_base[slot + 1])
2618                         slot++;
2619                 c->offset_slot_tab_2[adjusted_offset >> 14] = slot;
2620         }
2621 }
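
/*
 * Illustrative lookup using the tables built above (a sketch only; the lookup
 * helper actually used elsewhere in this file may differ in details):
 *
 *	if (adjusted_offset < ARRAY_LEN(c->offset_slot_tab_1))
 *		slot = c->offset_slot_tab_1[adjusted_offset];
 *	else
 *		slot = c->offset_slot_tab_2[adjusted_offset >> 14];
 *
 * The first table has one entry per small adjusted offset (slots 0 through
 * 29), while the second needs only one entry per 2^14 adjusted offsets
 * because the boundaries between the higher slots all fall on multiples of
 * 2^14.
 */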
2622
2623 static size_t
2624 lzx_get_compressor_size(size_t max_bufsize, unsigned compression_level)
2625 {
2626         if (compression_level <= MAX_FAST_LEVEL) {
2627                 if (lzx_is_16_bit(max_bufsize))
2628                         return offsetof(struct lzx_compressor, hc_mf_16) +
2629                                hc_matchfinder_size_16(max_bufsize);
2630                 else
2631                         return offsetof(struct lzx_compressor, hc_mf_32) +
2632                                hc_matchfinder_size_32(max_bufsize);
2633         } else {
2634                 if (lzx_is_16_bit(max_bufsize))
2635                         return offsetof(struct lzx_compressor, bt_mf_16) +
2636                                bt_matchfinder_size_16(max_bufsize);
2637                 else
2638                         return offsetof(struct lzx_compressor, bt_mf_32) +
2639                                bt_matchfinder_size_32(max_bufsize);
2640         }
2641 }
2642
2643 /* Compute the amount of memory needed to allocate an LZX compressor. */
2644 static u64
2645 lzx_get_needed_memory(size_t max_bufsize, unsigned compression_level,
2646                       bool destructive)
2647 {
2648         u64 size = 0;
2649
2650         if (max_bufsize > LZX_MAX_WINDOW_SIZE)
2651                 return 0;
2652
2653         size += lzx_get_compressor_size(max_bufsize, compression_level);
2654         if (!destructive)
2655                 size += max_bufsize; /* account for in_buffer */
2656         return size;
2657 }
2658
2659 /* Allocate an LZX compressor. */
2660 static int
2661 lzx_create_compressor(size_t max_bufsize, unsigned compression_level,
2662                       bool destructive, void **c_ret)
2663 {
2664         unsigned window_order;
2665         struct lzx_compressor *c;
2666
2667         /* Validate the maximum buffer size and get the window order from it. */
2668         window_order = lzx_get_window_order(max_bufsize);
2669         if (window_order == 0)
2670                 return WIMLIB_ERR_INVALID_PARAM;
2671
2672         /* Allocate the compressor. */
2673         c = MALLOC(lzx_get_compressor_size(max_bufsize, compression_level));
2674         if (!c)
2675                 goto oom0;
2676
2677         c->window_order = window_order;
2678         c->num_main_syms = lzx_get_num_main_syms(window_order);
2679         c->destructive = destructive;
2680
2681         /* Allocate the buffer for preprocessed data if needed. */
2682         if (!c->destructive) {
2683                 c->in_buffer = MALLOC(max_bufsize);
2684                 if (!c->in_buffer)
2685                         goto oom1;
2686         }
2687
2688         if (compression_level <= MAX_FAST_LEVEL) {
2689
2690                 /* Fast compression: Use lazy parsing. */
2691                 if (lzx_is_16_bit(max_bufsize))
2692                         c->impl = lzx_compress_lazy_16;
2693                 else
2694                         c->impl = lzx_compress_lazy_32;
2695
2696                 /* Scale max_search_depth and nice_match_length with the
2697                  * compression level. */
2698                 c->max_search_depth = (60 * compression_level) / 20;
2699                 c->nice_match_length = (80 * compression_level) / 20;
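
                /* For example, compression level 20 gives max_search_depth =
                 * (60 * 20) / 20 = 60 and nice_match_length = (80 * 20) / 20
                 * = 80. */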
2700
2701                 /* lzx_compress_lazy() needs max_search_depth >= 2 because it
2702                  * halves the max_search_depth when attempting a lazy match, and
2703                  * max_search_depth must be at least 1. */
2704                 c->max_search_depth = max(c->max_search_depth, 2);
2705         } else {
2706
2707                 /* Normal / high compression: Use near-optimal parsing. */
2708                 if (lzx_is_16_bit(max_bufsize))
2709                         c->impl = lzx_compress_near_optimal_16;
2710                 else
2711                         c->impl = lzx_compress_near_optimal_32;
2712
2713                 /* Scale max_search_depth and nice_match_length with the
2714                  * compression level. */
2715                 c->max_search_depth = (24 * compression_level) / 50;
2716                 c->nice_match_length = (48 * compression_level) / 50;
2717
2718                 /* Also scale num_optim_passes with the compression level.  But
2719                  * the more passes there are, the less they help --- so don't
2720                  * add them linearly.  */
2721                 c->num_optim_passes = 1;
2722                 c->num_optim_passes += (compression_level >= 45);
2723                 c->num_optim_passes += (compression_level >= 70);
2724                 c->num_optim_passes += (compression_level >= 100);
2725                 c->num_optim_passes += (compression_level >= 150);
2726                 c->num_optim_passes += (compression_level >= 200);
2727                 c->num_optim_passes += (compression_level >= 300);
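
                /* For example, compression level 50 gives max_search_depth =
                 * 24, nice_match_length = 48, and num_optim_passes = 2, while
                 * level 100 gives 48, 96, and 4 passes. */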
2728
2729                 /* max_search_depth must be at least 1. */
2730                 c->max_search_depth = max(c->max_search_depth, 1);
2731         }
2732
2733         /* Prepare the offset => offset slot mapping. */
2734         lzx_init_offset_slot_tabs(c);
2735
2736         *c_ret = c;
2737         return 0;
2738
2739 oom1:
2740         FREE(c);
2741 oom0:
2742         return WIMLIB_ERR_NOMEM;
2743 }
2744
2745 /* Compress a buffer of data. */
2746 static size_t
2747 lzx_compress(const void *restrict in, size_t in_nbytes,
2748              void *restrict out, size_t out_nbytes_avail, void *restrict _c)
2749 {
2750         struct lzx_compressor *c = _c;
2751         struct lzx_output_bitstream os;
2752         size_t result;
2753
2754         /* Don't bother trying to compress very small inputs. */
2755         if (in_nbytes < 64)
2756                 return 0;
2757
2758         /* Copy the input data into the internal buffer if needed. */
2759         if (!c->destructive) {
2760                 memcpy(c->in_buffer, in, in_nbytes);
2761                 in = c->in_buffer;
2762         }
2763
2764         /* Preprocess the input data. */
2765         lzx_preprocess((void *)in, in_nbytes);
2766
2767         /* Initially, the previous Huffman codeword lengths are all zeroes. */
2768         c->codes_index = 0;
2769         memset(&c->codes[1].lens, 0, sizeof(struct lzx_lens));
2770
2771         /* Initialize the output bitstream. */
2772         lzx_init_output(&os, out, out_nbytes_avail);
2773
2774         /* Call the compression level-specific compress() function. */
2775         (*c->impl)(c, in, in_nbytes, &os);
2776
2777         /* Flush the output bitstream. */
2778         result = lzx_flush_output(&os);
2779
2780         /* If the data did not compress to less than its original size and we
2781          * preprocessed the original buffer, then postprocess it to restore it
2782          * to its original state. */
2783         if (result == 0 && c->destructive)
2784                 lzx_postprocess((void *)in, in_nbytes);
2785
2786         /* Return the number of compressed bytes, or 0 if the input did not
2787          * compress to less than its original size. */
2788         return result;
2789 }
2790
2791 /* Free an LZX compressor. */
2792 static void
2793 lzx_free_compressor(void *_c)
2794 {
2795         struct lzx_compressor *c = _c;
2796
2797         if (!c->destructive)
2798                 FREE(c->in_buffer);
2799         FREE(c);
2800 }
2801
2802 const struct compressor_ops lzx_compressor_ops = {
2803         .get_needed_memory  = lzx_get_needed_memory,
2804         .create_compressor  = lzx_create_compressor,
2805         .compress           = lzx_compress,
2806         .free_compressor    = lzx_free_compressor,
2807 };
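
/*
 * Illustrative use of this ops table (a sketch only; in wimlib these functions
 * are normally reached through the generic compressor interface that the ops
 * table plugs into, and in_size, out_avail, etc. are placeholder names):
 *
 *	void *c;
 *	if (lzx_compressor_ops.create_compressor(in_size, 50, false, &c) == 0) {
 *		size_t csize = lzx_compressor_ops.compress(in, in_size, out,
 *							   out_avail, c);
 *		lzx_compressor_ops.free_compressor(c);
 *	}
 *
 * where a csize of 0 means the data did not compress to less than its
 * original size.
 */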