wimlib.net Git - wimlib/blob - src/lzx-compress.c

   1 /*
   2  * lzx-compress.c
   3  *
   4  * LZX compression routines
   5  */
   6
   7 /*
   8  * Copyright (C) 2012, 2013 Eric Biggers
   9  *
  10  * This file is part of wimlib, a library for working with WIM files.
  11  *
  12  * wimlib is free software; you can redistribute it and/or modify it under the
  13  * terms of the GNU General Public License as published by the Free
  14  * Software Foundation; either version 3 of the License, or (at your option)
  15  * any later version.
  16  *
  17  * wimlib is distributed in the hope that it will be useful, but WITHOUT ANY
  18  * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  19  * A PARTICULAR PURPOSE. See the GNU General Public License for more
  20  * details.
  21  *
  22  * You should have received a copy of the GNU General Public License
  23  * along with wimlib; if not, see http://www.gnu.org/licenses/.
  24  */
  25
  26
  27 /*
  28  * This file contains a compressor for the LZX compression format, as used in
  29  * the WIM file format.
  30  *
  31  * Format
  32  * ======
  33  *
  34  * First, the primary reference for the LZX compression format is the
  35  * specification released by Microsoft.
  36  *
  37  * Second, the comments in lzx-decompress.c provide some more information about
  38  * the LZX compression format, including errors in the Microsoft specification.
  39  *
  40  * Do note that LZX shares many similarities with DEFLATE, the algorithm used by
  41  * zlib and gzip.  Both LZX and DEFLATE use LZ77 matching and Huffman coding,
  42  * and certain other details are quite similar, such as the method for storing
  43  * Huffman codes.  However, some of the main differences are:
  44  *
  45  * - LZX preprocesses the data to attempt to make x86 machine code slightly more
  46  *   compressible before attempting to compress it further.
  47  * - LZX uses a "main" alphabet which combines literals and matches, with the
  48  *   match symbols containing a "length header" (giving all or part of the match
  49  *   length) and a "position slot" (giving, roughly speaking, the order of
  50  *   magnitude of the match offset).
  51  * - LZX does not have static Huffman blocks; however it does have two types of
  52  *   dynamic Huffman blocks ("aligned offset" and "verbatim").
  53  * - LZX has a minimum match length of 2 rather than 3.
  54  * - In LZX, match offsets 0 through 2 actually represent entries in an LRU
  55  *   queue of match offsets.  This is very useful for certain types of files,
  56  *   such as binary files that have repeating records.
  57  *
  58  * Algorithms
  59  * ==========
  60  *
  61  * There are actually two distinct overall algorithms implemented here.  We
  62  * shall refer to them as the "slow" algorithm and the "fast" algorithm.  The
  63  * "slow" algorithm spends more time compressing to achieve a higher compression
  64  * ratio compared to the "fast" algorithm.  More details are presented below.
  65  *
  66  * Slow algorithm
  67  * --------------
  68  *
  69  * The "slow" algorithm to generate LZX-compressed data is roughly as follows:
  70  *
  71  * 1. Preprocess the input data to translate the targets of x86 call
  72  *    instructions to absolute offsets.
  73  *
  74  * 2. Build the suffix array and inverse suffix array for the input data.  The
  75  *    suffix array contains the indices of all suffixes of the input data,
  76  *    sorted lexcographically by the corresponding suffixes.  The "position" of
  77  *    a suffix is the index of that suffix in the original string, whereas the
  78  *    "rank" of a suffix is the index at which that suffix's position is found
  79  *    in the suffix array.
  80  *
  81  * 3. Build the longest common prefix array corresponding to the suffix array.
  82  *
  83  * 4. For each suffix, find the highest lower ranked suffix that has a lower
  84  *    position, the lowest higher ranked suffix that has a lower position, and
  85  *    the length of the common prefix shared between each.   This information is
  86  *    later used to link suffix ranks into a doubly-linked list for searching
  87  *    the suffix array.
  88  *
  89  * 5. Set a default cost model for matches/literals.
  90  *
  91  * 6. Determine the lowest cost sequence of LZ77 matches ((offset, length)
  92  *    pairs) and literal bytes to divide the input into.  Raw match-finding is
  93  *    done by searching the suffix array using a linked list to avoid
  94  *    considering any suffixes that start after the current position.  Each run
  95  *    of the match-finder returns the approximate lowest-cost longest match as
  96  *    well as any shorter matches that have even lower approximate costs.  Each
  97  *    such run also adds the suffix rank of the current position into the linked
  98  *    list being used to search the suffix array.  Parsing, or match-choosing,
  99  *    is solved as a minimum-cost path problem using a forward "optimal parsing"
 100  *    algorithm based on the Deflate encoder from 7-Zip.  This algorithm moves
 101  *    forward calculating the minimum cost to reach each byte until either a
 102  *    very long match is found or until a position is found at which no matches
 103  *    start or overlap.
 104  *
 105  * 7. Build the Huffman codes needed to output the matches/literals.
 106  *
 107  * 8. Up to a certain number of iterations, use the resulting Huffman codes to
 108  *    refine a cost model and go back to Step #6 to determine an improved
 109  *    sequence of matches and literals.
 110  *
 111  * 9. Output the resulting block using the match/literal sequences and the
 112  *    Huffman codes that were computed for the block.
 113  *
 114  * Note: the algorithm does not yet attempt to split the input into multiple LZX
 115  * blocks, instead using a series of blocks of LZX_DIV_BLOCK_SIZE bytes.
 116  *
 117  * Fast algorithm
 118  * --------------
 119  *
 120  * The fast algorithm (and the only one available in wimlib v1.5.1 and earlier)
 121  * spends much less time on the main bottlenecks of the compression process ---
 122  * that is, the match finding and match choosing.  Matches are found and chosen
 123  * with hash chains using a greedy parse with one position of look-ahead.  No
 124  * block splitting is done; only compressing the full input into an aligned
 125  * offset block is considered.
 126  *
 127  * Acknowledgments
 128  * ===============
 129  *
 130  * Acknowledgments to several open-source projects and research papers that made
 131  * it possible to implement this code:
 132  *
 133  * - divsufsort (author: Yuta Mori), for the suffix array construction code,
 134  *   located in a separate directory (divsufsort/).
 135  *
 136  * - "Linear-Time Longest-Common-Prefix Computation in Suffix Arrays and Its
 137  *   Applications" (Kasai et al. 2001), for the LCP array computation.
 138  *
 139  * - "LPF computation revisited" (Crochemore et al. 2009) for the prev and next
 140  *   array computations.
 141  *
 142  * - 7-Zip (author: Igor Pavlov) for the algorithm for forward optimal parsing
 143  *   (match-choosing).
 144  *
 145  * - zlib (author: Jean-loup Gailly and Mark Adler), for the hash table
 146  *   match-finding algorithm (used in lz77.c).
 147  *
 148  * - lzx-compress (author: Matthew T. Russotto), on which some parts of this
 149  *   code were originally based.
 150  */
 151
 152 #ifdef HAVE_CONFIG_H
 153 #  include "config.h"
 154 #endif
 155
 156 #include "wimlib.h"
 157 #include "wimlib/compressor_ops.h"
 158 #include "wimlib/compress_common.h"
 159 #include "wimlib/endianness.h"
 160 #include "wimlib/error.h"
 161 #include "wimlib/lz_hash.h"
 162 #include "wimlib/lz_sarray.h"
 163 #include "wimlib/lzx.h"
 164 #include "wimlib/util.h"
 165 #include <pthread.h>
 166 #include <math.h>
 167 #include <string.h>
 168
 169 #ifdef ENABLE_LZX_DEBUG
 170 #  include "wimlib/decompress_common.h"
 171 #endif
 172
 173 typedef u32 block_cost_t;
 174 #define INFINITE_BLOCK_COST     ((block_cost_t)~0U)
 175
 176 #define LZX_OPTIM_ARRAY_SIZE    4096
 177
 178 #define LZX_DIV_BLOCK_SIZE      32768
 179
 180 #define LZX_MAX_CACHE_PER_POS   10
 181
 182 /* Codewords for the LZX main, length, and aligned offset Huffman codes  */
 183 struct lzx_codewords {
 184         u16 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
 185         u16 len[LZX_LENCODE_NUM_SYMBOLS];
 186         u16 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
 187 };
 188
 189 /* Codeword lengths (in bits) for the LZX main, length, and aligned offset
 190  * Huffman codes.
 191  *
 192  * A 0 length means the codeword has zero frequency.
 193  */
 194 struct lzx_lens {
 195         u8 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
 196         u8 len[LZX_LENCODE_NUM_SYMBOLS];
 197         u8 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
 198 };
 199
 200 /* Costs for the LZX main, length, and aligned offset Huffman symbols.
 201  *
 202  * If a codeword has zero frequency, it must still be assigned some nonzero cost
 203  * --- generally a high cost, since even if it gets used in the next iteration,
 204  * it probably will not be used very times.  */
 205 struct lzx_costs {
 206         u8 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
 207         u8 len[LZX_LENCODE_NUM_SYMBOLS];
 208         u8 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
 209 };
 210
 211 /* The LZX main, length, and aligned offset Huffman codes  */
 212 struct lzx_codes {
 213         struct lzx_codewords codewords;
 214         struct lzx_lens lens;
 215 };
 216
 217 /* Tables for tallying symbol frequencies in the three LZX alphabets  */
 218 struct lzx_freqs {
 219         input_idx_t main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
 220         input_idx_t len[LZX_LENCODE_NUM_SYMBOLS];
 221         input_idx_t aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
 222 };
 223
 224 /* LZX intermediate match/literal format  */
 225 struct lzx_match {
 226         /* Bit     Description
 227          *
 228          * 31      1 if a match, 0 if a literal.
 229          *
 230          * 30-25   position slot.  This can be at most 50, so it will fit in 6
 231          *         bits.
 232          *
 233          * 8-24    position footer.  This is the offset of the real formatted
 234          *         offset from the position base.  This can be at most 17 bits
 235          *         (since lzx_extra_bits[LZX_MAX_POSITION_SLOTS - 1] is 17).
 236          *
 237          * 0-7     length of match, minus 2.  This can be at most
 238          *         (LZX_MAX_MATCH_LEN - 2) == 255, so it will fit in 8 bits.  */
 239         u32 data;
 240 };
 241
 242 /* Specification for an LZX block.  */
 243 struct lzx_block_spec {
 244
 245         /* One of the LZX_BLOCKTYPE_* constants indicating which type of this
 246          * block.  */
 247         int block_type;
 248
 249         /* 0-based position in the window at which this block starts.  */
 250         input_idx_t window_pos;
 251
 252         /* The number of bytes of uncompressed data this block represents.  */
 253         input_idx_t block_size;
 254
 255         /* The position in the 'chosen_matches' array in the `struct
 256          * lzx_compressor' at which the match/literal specifications for
 257          * this block begin.  */
 258         input_idx_t chosen_matches_start_pos;
 259
 260         /* The number of match/literal specifications for this block.  */
 261         input_idx_t num_chosen_matches;
 262
 263         /* Huffman codes for this block.  */
 264         struct lzx_codes codes;
 265 };
 266
 267 /*
 268  * An array of these structures is used during the match-choosing algorithm.
 269  * They correspond to consecutive positions in the window and are used to keep
 270  * track of the cost to reach each position, and the match/literal choices that
 271  * need to be chosen to reach that position.
 272  */
 273 struct lzx_optimal {
 274         /* The approximate minimum cost, in bits, to reach this position in the
 275          * window which has been found so far.  */
 276         block_cost_t cost;
 277
 278         /* The union here is just for clarity, since the fields are used in two
 279          * slightly different ways.  Initially, the @prev structure is filled in
 280          * first, and links go from later in the window to earlier in the
 281          * window.  Later, @next structure is filled in and links go from
 282          * earlier in the window to later in the window.  */
 283         union {
 284                 struct {
 285                         /* Position of the start of the match or literal that
 286                          * was taken to get to this position in the approximate
 287                          * minimum-cost parse.  */
 288                         input_idx_t link;
 289
 290                         /* Offset (as in an LZ (length, offset) pair) of the
 291                          * match or literal that was taken to get to this
 292                          * position in the approximate minimum-cost parse.  */
 293                         input_idx_t match_offset;
 294                 } prev;
 295                 struct {
 296                         /* Position at which the match or literal starting at
 297                          * this position ends in the minimum-cost parse.  */
 298                         input_idx_t link;
 299
 300                         /* Offset (as in an LZ (length, offset) pair) of the
 301                          * match or literal starting at this position in the
 302                          * approximate minimum-cost parse.  */
 303                         input_idx_t match_offset;
 304                 } next;
 305         };
 306
 307         /* The match offset LRU queue that will exist when the approximate
 308          * minimum-cost path to reach this position is taken.  */
 309         struct lzx_lru_queue queue;
 310 };
 311
 312 /* State of the LZX compressor.  */
 313 struct lzx_compressor {
 314
 315         /* The parameters that were used to create the compressor.  */
 316         struct wimlib_lzx_compressor_params params;
 317
 318         /* The buffer of data to be compressed.
 319          *
 320          * 0xe8 byte preprocessing is done directly on the data here before
 321          * further compression.
 322          *
 323          * Note that this compressor does *not* use a real sliding window!!!!
 324          * It's not needed in the WIM format, since every chunk is compressed
 325          * independently.  This is by design, to allow random access to the
 326          * chunks.
 327          *
 328          * We reserve a few extra bytes to potentially allow reading off the end
 329          * of the array in the match-finding code for optimization purposes.
 330          */
 331         u8 *window;
 332
 333         /* Number of bytes of data to be compressed, which is the number of
 334          * bytes of data in @window that are actually valid.  */
 335         input_idx_t window_size;
 336
 337         /* Allocated size of the @window.  */
 338         input_idx_t max_window_size;
 339
 340         /* Number of symbols in the main alphabet (depends on the
 341          * @max_window_size since it determines the maximum allowed offset).  */
 342         unsigned num_main_syms;
 343
 344         /* The current match offset LRU queue.  */
 345         struct lzx_lru_queue queue;
 346
 347         /* Space for the sequences of matches/literals that were chosen for each
 348          * block.  */
 349         struct lzx_match *chosen_matches;
 350
 351         /* Information about the LZX blocks the preprocessed input was divided
 352          * into.  */
 353         struct lzx_block_spec *block_specs;
 354
 355         /* Number of LZX blocks the input was divided into; a.k.a. the number of
 356          * elements of @block_specs that are valid.  */
 357         unsigned num_blocks;
 358
 359         /* This is simply filled in with zeroes and used to avoid special-casing
 360          * the output of the first compressed Huffman code, which conceptually
 361          * has a delta taken from a code with all symbols having zero-length
 362          * codewords.  */
 363         struct lzx_codes zero_codes;
 364
 365         /* The current cost model.  */
 366         struct lzx_costs costs;
 367
 368         /* Fast algorithm only:  Array of hash table links.  */
 369         input_idx_t *prev_tab;
 370
 371         /* Slow algorithm only: Suffix array match-finder.  */
 372         struct lz_sarray lz_sarray;
 373
 374         /* Position in window of next match to return.  */
 375         input_idx_t match_window_pos;
 376
 377         /* The match-finder shall ensure the length of matches does not exceed
 378          * this position in the input.  */
 379         input_idx_t match_window_end;
 380
 381         /* Matches found by the match-finder are cached in the following array
 382          * to achieve a slight speedup when the same matches are needed on
 383          * subsequent passes.  This is suboptimal because different matches may
 384          * be preferred with different cost models, but seems to be a worthwhile
 385          * speedup.  */
 386         struct raw_match *cached_matches;
 387         unsigned cached_matches_pos;
 388         bool matches_cached;
 389
 390         /* Slow algorithm only: Temporary space used for match-choosing
 391          * algorithm.
 392          *
 393          * The size of this array must be at least LZX_MAX_MATCH_LEN but
 394          * otherwise is arbitrary.  More space simply allows the match-choosing
 395          * algorithm to potentially find better matches (depending on the input,
 396          * as always).  */
 397         struct lzx_optimal *optimum;
 398
 399         /* Slow algorithm only: Variables used by the match-choosing algorithm.
 400          *
 401          * When matches have been chosen, optimum_cur_idx is set to the position
 402          * in the window of the next match/literal to return and optimum_end_idx
 403          * is set to the position in the window at the end of the last
 404          * match/literal to return.  */
 405         u32 optimum_cur_idx;
 406         u32 optimum_end_idx;
 407 };
 408
 409 /* Returns the LZX position slot that corresponds to a given match offset,
 410  * taking into account the recent offset queue and updating it if the offset is
 411  * found in it.  */
 412 static unsigned
 413 lzx_get_position_slot(unsigned offset, struct lzx_lru_queue *queue)
 414 {
 415         unsigned position_slot;
 416
 417         /* See if the offset was recently used.  */
 418         for (unsigned i = 0; i < LZX_NUM_RECENT_OFFSETS; i++) {
 419                 if (offset == queue->R[i]) {
 420                         /* Found it.  */
 421
 422                         /* Bring the repeat offset to the front of the
 423                          * queue.  Note: this is, in fact, not a real
 424                          * LRU queue because repeat matches are simply
 425                          * swapped to the front.  */
 426                         swap(queue->R[0], queue->R[i]);
 427
 428                         /* The resulting position slot is simply the first index
 429                          * at which the offset was found in the queue.  */
 430                         return i;
 431                 }
 432         }
 433
 434         /* The offset was not recently used; look up its real position slot.  */
 435         position_slot = lzx_get_position_slot_raw(offset + LZX_OFFSET_OFFSET);
 436
 437         /* Bring the new offset to the front of the queue.  */
 438         for (unsigned i = LZX_NUM_RECENT_OFFSETS - 1; i > 0; i--)
 439                 queue->R[i] = queue->R[i - 1];
 440         queue->R[0] = offset;
 441
 442         return position_slot;
 443 }
 444
 445 /* Build the main, length, and aligned offset Huffman codes used in LZX.
 446  *
 447  * This takes as input the frequency tables for each code and produces as output
 448  * a set of tables that map symbols to codewords and codeword lengths.  */
 449 static void
 450 lzx_make_huffman_codes(const struct lzx_freqs *freqs,
 451                        struct lzx_codes *codes,
 452                        unsigned num_main_syms)
 453 {
 454         make_canonical_huffman_code(num_main_syms,
 455                                     LZX_MAX_MAIN_CODEWORD_LEN,
 456                                     freqs->main,
 457                                     codes->lens.main,
 458                                     codes->codewords.main);
 459
 460         make_canonical_huffman_code(LZX_LENCODE_NUM_SYMBOLS,
 461                                     LZX_MAX_LEN_CODEWORD_LEN,
 462                                     freqs->len,
 463                                     codes->lens.len,
 464                                     codes->codewords.len);
 465
 466         make_canonical_huffman_code(LZX_ALIGNEDCODE_NUM_SYMBOLS,
 467                                     LZX_MAX_ALIGNED_CODEWORD_LEN,
 468                                     freqs->aligned,
 469                                     codes->lens.aligned,
 470                                     codes->codewords.aligned);
 471 }
 472
 473 /*
 474  * Output an LZX match.
 475  *
 476  * @out:         The bitstream to write the match to.
 477  * @block_type:  The type of the LZX block (LZX_BLOCKTYPE_ALIGNED or LZX_BLOCKTYPE_VERBATIM)
 478  * @match:       The match.
 479  * @codes:       Pointer to a structure that contains the codewords for the
 480  *               main, length, and aligned offset Huffman codes.
 481  */
 482 static void
 483 lzx_write_match(struct output_bitstream *out, int block_type,
 484                 struct lzx_match match, const struct lzx_codes *codes)
 485 {
 486         /* low 8 bits are the match length minus 2 */
 487         unsigned match_len_minus_2 = match.data & 0xff;
 488         /* Next 17 bits are the position footer */
 489         unsigned position_footer = (match.data >> 8) & 0x1ffff; /* 17 bits */
 490         /* Next 6 bits are the position slot. */
 491         unsigned position_slot = (match.data >> 25) & 0x3f;     /* 6 bits */
 492         unsigned len_header;
 493         unsigned len_footer;
 494         unsigned main_symbol;
 495         unsigned num_extra_bits;
 496         unsigned verbatim_bits;
 497         unsigned aligned_bits;
 498
 499         /* If the match length is less than MIN_MATCH_LEN (= 2) +
 500          * NUM_PRIMARY_LENS (= 7), the length header contains
 501          * the match length minus MIN_MATCH_LEN, and there is no
 502          * length footer.
 503          *
 504          * Otherwise, the length header contains
 505          * NUM_PRIMARY_LENS, and the length footer contains
 506          * the match length minus NUM_PRIMARY_LENS minus
 507          * MIN_MATCH_LEN. */
 508         if (match_len_minus_2 < LZX_NUM_PRIMARY_LENS) {
 509                 len_header = match_len_minus_2;
 510                 /* No length footer-- mark it with a special
 511                  * value. */
 512                 len_footer = (unsigned)(-1);
 513         } else {
 514                 len_header = LZX_NUM_PRIMARY_LENS;
 515                 len_footer = match_len_minus_2 - LZX_NUM_PRIMARY_LENS;
 516         }
 517
 518         /* Combine the position slot with the length header into a single symbol
 519          * that will be encoded with the main code.
 520          *
 521          * The actual main symbol is offset by LZX_NUM_CHARS because values
 522          * under LZX_NUM_CHARS are used to indicate a literal byte rather than a
 523          * match.  */
 524         main_symbol = ((position_slot << 3) | len_header) + LZX_NUM_CHARS;
 525
 526         /* Output main symbol. */
 527         bitstream_put_bits(out, codes->codewords.main[main_symbol],
 528                            codes->lens.main[main_symbol]);
 529
 530         /* If there is a length footer, output it using the
 531          * length Huffman code. */
 532         if (len_footer != (unsigned)(-1)) {
 533                 bitstream_put_bits(out, codes->codewords.len[len_footer],
 534                                    codes->lens.len[len_footer]);
 535         }
 536
 537         num_extra_bits = lzx_get_num_extra_bits(position_slot);
 538
 539         /* For aligned offset blocks with at least 3 extra bits, output the
 540          * verbatim bits literally, then the aligned bits encoded using the
 541          * aligned offset code.  Otherwise, only the verbatim bits need to be
 542          * output. */
 543         if ((block_type == LZX_BLOCKTYPE_ALIGNED) && (num_extra_bits >= 3)) {
 544
 545                 verbatim_bits = position_footer >> 3;
 546                 bitstream_put_bits(out, verbatim_bits,
 547                                    num_extra_bits - 3);
 548
 549                 aligned_bits = (position_footer & 7);
 550                 bitstream_put_bits(out,
 551                                    codes->codewords.aligned[aligned_bits],
 552                                    codes->lens.aligned[aligned_bits]);
 553         } else {
 554                 /* verbatim bits is the same as the position
 555                  * footer, in this case. */
 556                 bitstream_put_bits(out, position_footer, num_extra_bits);
 557         }
 558 }
 559
 560 static unsigned
 561 lzx_build_precode(const u8 lens[restrict],
 562                   const u8 prev_lens[restrict],
 563                   const unsigned num_syms,
 564                   input_idx_t precode_freqs[restrict LZX_PRECODE_NUM_SYMBOLS],
 565                   u8 output_syms[restrict num_syms],
 566                   u8 precode_lens[restrict LZX_PRECODE_NUM_SYMBOLS],
 567                   u16 precode_codewords[restrict LZX_PRECODE_NUM_SYMBOLS],
 568                   unsigned *num_additional_bits_ret)
 569 {
 570         memset(precode_freqs, 0,
 571                LZX_PRECODE_NUM_SYMBOLS * sizeof(precode_freqs[0]));
 572
 573         /* Since the code word lengths use a form of RLE encoding, the goal here
 574          * is to find each run of identical lengths when going through them in
 575          * symbol order (including runs of length 1).  For each run, as many
 576          * lengths are encoded using RLE as possible, and the rest are output
 577          * literally.
 578          *
 579          * output_syms[] will be filled in with the length symbols that will be
 580          * output, including RLE codes, not yet encoded using the precode.
 581          *
 582          * cur_run_len keeps track of how many code word lengths are in the
 583          * current run of identical lengths.  */
 584         unsigned output_syms_idx = 0;
 585         unsigned cur_run_len = 1;
 586         unsigned num_additional_bits = 0;
 587         for (unsigned i = 1; i <= num_syms; i++) {
 588
 589                 if (i != num_syms && lens[i] == lens[i - 1]) {
 590                         /* Still in a run--- keep going. */
 591                         cur_run_len++;
 592                         continue;
 593                 }
 594
 595                 /* Run ended! Check if it is a run of zeroes or a run of
 596                  * nonzeroes. */
 597
 598                 /* The symbol that was repeated in the run--- not to be confused
 599                  * with the length *of* the run (cur_run_len) */
 600                 unsigned len_in_run = lens[i - 1];
 601
 602                 if (len_in_run == 0) {
 603                         /* A run of 0's.  Encode it in as few length
 604                          * codes as we can. */
 605
 606                         /* The magic length 18 indicates a run of 20 + n zeroes,
 607                          * where n is an uncompressed literal 5-bit integer that
 608                          * follows the magic length. */
 609                         while (cur_run_len >= 20) {
 610                                 unsigned additional_bits;
 611
 612                                 additional_bits = min(cur_run_len - 20, 0x1f);
 613                                 num_additional_bits += 5;
 614                                 precode_freqs[18]++;
 615                                 output_syms[output_syms_idx++] = 18;
 616                                 output_syms[output_syms_idx++] = additional_bits;
 617                                 cur_run_len -= 20 + additional_bits;
 618                         }
 619
 620                         /* The magic length 17 indicates a run of 4 + n zeroes,
 621                          * where n is an uncompressed literal 4-bit integer that
 622                          * follows the magic length. */
 623                         while (cur_run_len >= 4) {
 624                                 unsigned additional_bits;
 625
 626                                 additional_bits = min(cur_run_len - 4, 0xf);
 627                                 num_additional_bits += 4;
 628                                 precode_freqs[17]++;
 629                                 output_syms[output_syms_idx++] = 17;
 630                                 output_syms[output_syms_idx++] = additional_bits;
 631                                 cur_run_len -= 4 + additional_bits;
 632                         }
 633
 634                 } else {
 635
 636                         /* A run of nonzero lengths. */
 637
 638                         /* The magic length 19 indicates a run of 4 + n
 639                          * nonzeroes, where n is a literal bit that follows the
 640                          * magic length, and where the value of the lengths in
 641                          * the run is given by an extra length symbol, encoded
 642                          * with the precode, that follows the literal bit.
 643                          *
 644                          * The extra length symbol is encoded as a difference
 645                          * from the length of the codeword for the first symbol
 646                          * in the run in the previous code.
 647                          * */
 648                         while (cur_run_len >= 4) {
 649                                 unsigned additional_bits;
 650                                 signed char delta;
 651
 652                                 additional_bits = (cur_run_len > 4);
 653                                 num_additional_bits += 1;
 654                                 delta = (signed char)prev_lens[i - cur_run_len] -
 655                                         (signed char)len_in_run;
 656                                 if (delta < 0)
 657                                         delta += 17;
 658                                 precode_freqs[19]++;
 659                                 precode_freqs[(unsigned char)delta]++;
 660                                 output_syms[output_syms_idx++] = 19;
 661                                 output_syms[output_syms_idx++] = additional_bits;
 662                                 output_syms[output_syms_idx++] = delta;
 663                                 cur_run_len -= 4 + additional_bits;
 664                         }
 665                 }
 666
 667                 /* Any remaining lengths in the run are outputted without RLE,
 668                  * as a difference from the length of that codeword in the
 669                  * previous code. */
 670                 while (cur_run_len > 0) {
 671                         signed char delta;
 672
 673                         delta = (signed char)prev_lens[i - cur_run_len] -
 674                                 (signed char)len_in_run;
 675                         if (delta < 0)
 676                                 delta += 17;
 677
 678                         precode_freqs[(unsigned char)delta]++;
 679                         output_syms[output_syms_idx++] = delta;
 680                         cur_run_len--;
 681                 }
 682
 683                 cur_run_len = 1;
 684         }
 685
 686         /* Build the precode from the frequencies of the length symbols. */
 687
 688         make_canonical_huffman_code(LZX_PRECODE_NUM_SYMBOLS,
 689                                     LZX_MAX_PRE_CODEWORD_LEN,
 690                                     precode_freqs, precode_lens,
 691                                     precode_codewords);
 692
 693         *num_additional_bits_ret = num_additional_bits;
 694
 695         return output_syms_idx;
 696 }
 697
 698 /*
 699  * Writes a compressed Huffman code to the output, preceded by the precode for
 700  * it.
 701  *
 702  * The Huffman code is represented in the output as a series of path lengths
 703  * from which the canonical Huffman code can be reconstructed.  The path lengths
 704  * themselves are compressed using a separate Huffman code, the precode, which
 705  * consists of LZX_PRECODE_NUM_SYMBOLS (= 20) symbols that cover all possible
 706  * code lengths, plus extra codes for repeated lengths.  The path lengths of the
 707  * precode precede the path lengths of the larger code and are uncompressed,
 708  * consisting of 20 entries of 4 bits each.
 709  *
 710  * @out:                Bitstream to write the code to.
 711  * @lens:               The code lengths for the Huffman code, indexed by symbol.
 712  * @prev_lens:          Code lengths for this Huffman code, indexed by symbol,
 713  *                      in the *previous block*, or all zeroes if this is the
 714  *                      first block.
 715  * @num_syms:           The number of symbols in the code.
 716  */
 717 static void
 718 lzx_write_compressed_code(struct output_bitstream *out,
 719                           const u8 lens[restrict],
 720                           const u8 prev_lens[restrict],
 721                           unsigned num_syms)
 722 {
 723         input_idx_t precode_freqs[LZX_PRECODE_NUM_SYMBOLS];
 724         u8 output_syms[num_syms];
 725         u8 precode_lens[LZX_PRECODE_NUM_SYMBOLS];
 726         u16 precode_codewords[LZX_PRECODE_NUM_SYMBOLS];
 727         unsigned i;
 728         unsigned num_output_syms;
 729         u8 precode_sym;
 730         unsigned dummy;
 731
 732         num_output_syms = lzx_build_precode(lens,
 733                                             prev_lens,
 734                                             num_syms,
 735                                             precode_freqs,
 736                                             output_syms,
 737                                             precode_lens,
 738                                             precode_codewords,
 739                                             &dummy);
 740
 741         /* Write the lengths of the precode codes to the output. */
 742         for (i = 0; i < LZX_PRECODE_NUM_SYMBOLS; i++)
 743                 bitstream_put_bits(out, precode_lens[i],
 744                                    LZX_PRECODE_ELEMENT_SIZE);
 745
 746         /* Write the length symbols, encoded with the precode, to the output. */
 747
 748         for (i = 0; i < num_output_syms; ) {
 749                 precode_sym = output_syms[i++];
 750
 751                 bitstream_put_bits(out, precode_codewords[precode_sym],
 752                                    precode_lens[precode_sym]);
 753                 switch (precode_sym) {
 754                 case 17:
 755                         bitstream_put_bits(out, output_syms[i++], 4);
 756                         break;
 757                 case 18:
 758                         bitstream_put_bits(out, output_syms[i++], 5);
 759                         break;
 760                 case 19:
 761                         bitstream_put_bits(out, output_syms[i++], 1);
 762                         bitstream_put_bits(out,
 763                                            precode_codewords[output_syms[i]],
 764                                            precode_lens[output_syms[i]]);
 765                         i++;
 766                         break;
 767                 default:
 768                         break;
 769                 }
 770         }
 771 }
 772
 773 /*
 774  * Writes all compressed matches and literal bytes in an LZX block to the the
 775  * output bitstream.
 776  *
 777  * @ostream
 778  *      The output bitstream.
 779  * @block_type
 780  *      The type of the block (LZX_BLOCKTYPE_ALIGNED or LZX_BLOCKTYPE_VERBATIM).
 781  * @match_tab
 782  *      The array of matches/literals that will be output (length @match_count).
 783  * @match_count
 784  *      Number of matches/literals to be output.
 785  * @codes
 786  *      Pointer to a structure that contains the codewords for the main, length,
 787  *      and aligned offset Huffman codes.
 788  */
 789 static void
 790 lzx_write_matches_and_literals(struct output_bitstream *ostream,
 791                                int block_type,
 792                                const struct lzx_match match_tab[],
 793                                unsigned match_count,
 794                                const struct lzx_codes *codes)
 795 {
 796         for (unsigned i = 0; i < match_count; i++) {
 797                 struct lzx_match match = match_tab[i];
 798
 799                 /* High bit of the match indicates whether the match is an
 800                  * actual match (1) or a literal uncompressed byte (0)  */
 801                 if (match.data & 0x80000000) {
 802                         /* match */
 803                         lzx_write_match(ostream, block_type,
 804                                         match, codes);
 805                 } else {
 806                         /* literal byte */
 807                         bitstream_put_bits(ostream,
 808                                            codes->codewords.main[match.data],
 809                                            codes->lens.main[match.data]);
 810                 }
 811         }
 812 }
 813
 814 static void
 815 lzx_assert_codes_valid(const struct lzx_codes * codes, unsigned num_main_syms)
 816 {
 817 #ifdef ENABLE_LZX_DEBUG
 818         unsigned i;
 819
 820         for (i = 0; i < num_main_syms; i++)
 821                 LZX_ASSERT(codes->lens.main[i] <= LZX_MAX_MAIN_CODEWORD_LEN);
 822
 823         for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++)
 824                 LZX_ASSERT(codes->lens.len[i] <= LZX_MAX_LEN_CODEWORD_LEN);
 825
 826         for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++)
 827                 LZX_ASSERT(codes->lens.aligned[i] <= LZX_MAX_ALIGNED_CODEWORD_LEN);
 828
 829         const unsigned tablebits = 10;
 830         u16 decode_table[(1 << tablebits) +
 831                          (2 * max(num_main_syms, LZX_LENCODE_NUM_SYMBOLS))]
 832                          _aligned_attribute(DECODE_TABLE_ALIGNMENT);
 833         LZX_ASSERT(0 == make_huffman_decode_table(decode_table,
 834                                                   num_main_syms,
 835                                                   min(tablebits, LZX_MAINCODE_TABLEBITS),
 836                                                   codes->lens.main,
 837                                                   LZX_MAX_MAIN_CODEWORD_LEN));
 838         LZX_ASSERT(0 == make_huffman_decode_table(decode_table,
 839                                                   LZX_LENCODE_NUM_SYMBOLS,
 840                                                   min(tablebits, LZX_LENCODE_TABLEBITS),
 841                                                   codes->lens.len,
 842                                                   LZX_MAX_LEN_CODEWORD_LEN));
 843         LZX_ASSERT(0 == make_huffman_decode_table(decode_table,
 844                                                   LZX_ALIGNEDCODE_NUM_SYMBOLS,
 845                                                   min(tablebits, LZX_ALIGNEDCODE_TABLEBITS),
 846                                                   codes->lens.aligned,
 847                                                   LZX_MAX_ALIGNED_CODEWORD_LEN));
 848 #endif /* ENABLE_LZX_DEBUG */
 849 }
 850
 851 /* Write an LZX aligned offset or verbatim block to the output.  */
 852 static void
 853 lzx_write_compressed_block(int block_type,
 854                            unsigned block_size,
 855                            unsigned max_window_size,
 856                            unsigned num_main_syms,
 857                            struct lzx_match * chosen_matches,
 858                            unsigned num_chosen_matches,
 859                            const struct lzx_codes * codes,
 860                            const struct lzx_codes * prev_codes,
 861                            struct output_bitstream * ostream)
 862 {
 863         unsigned i;
 864
 865         LZX_ASSERT(block_type == LZX_BLOCKTYPE_ALIGNED ||
 866                    block_type == LZX_BLOCKTYPE_VERBATIM);
 867         lzx_assert_codes_valid(codes, num_main_syms);
 868
 869         /* The first three bits indicate the type of block and are one of the
 870          * LZX_BLOCKTYPE_* constants.  */
 871         bitstream_put_bits(ostream, block_type, 3);
 872
 873         /* Output the block size.
 874          *
 875          * The original LZX format seemed to always encode the block size in 3
 876          * bytes.  However, the implementation in WIMGAPI, as used in WIM files,
 877          * uses the first bit to indicate whether the block is the default size
 878          * (32768) or a different size given explicitly by the next 16 bits.
 879          *
 880          * By default, this compressor uses a window size of 32768 and therefore
 881          * follows the WIMGAPI behavior.  However, this compressor also supports
 882          * window sizes greater than 32768 bytes, which do not appear to be
 883          * supported by WIMGAPI.  In such cases, we retain the default size bit
 884          * to mean a size of 32768 bytes but output non-default block size in 24
 885          * bits rather than 16.  The compatibility of this behavior is unknown
 886          * because WIMs created with chunk size greater than 32768 can seemingly
 887          * only be opened by wimlib anyway.  */
 888         if (block_size == LZX_DEFAULT_BLOCK_SIZE) {
 889                 bitstream_put_bits(ostream, 1, 1);
 890         } else {
 891                 bitstream_put_bits(ostream, 0, 1);
 892
 893                 if (max_window_size >= 65536)
 894                         bitstream_put_bits(ostream, block_size >> 16, 8);
 895
 896                 bitstream_put_bits(ostream, block_size, 16);
 897         }
 898
 899         /* Write out lengths of the main code. Note that the LZX specification
 900          * incorrectly states that the aligned offset code comes after the
 901          * length code, but in fact it is the very first code to be written
 902          * (before the main code).  */
 903         if (block_type == LZX_BLOCKTYPE_ALIGNED)
 904                 for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++)
 905                         bitstream_put_bits(ostream, codes->lens.aligned[i],
 906                                            LZX_ALIGNEDCODE_ELEMENT_SIZE);
 907
 908         LZX_DEBUG("Writing main code...");
 909
 910         /* Write the precode and lengths for the first LZX_NUM_CHARS symbols in
 911          * the main code, which are the codewords for literal bytes.  */
 912         lzx_write_compressed_code(ostream,
 913                                   codes->lens.main,
 914                                   prev_codes->lens.main,
 915                                   LZX_NUM_CHARS);
 916
 917         /* Write the precode and lengths for the rest of the main code, which
 918          * are the codewords for match headers.  */
 919         lzx_write_compressed_code(ostream,
 920                                   codes->lens.main + LZX_NUM_CHARS,
 921                                   prev_codes->lens.main + LZX_NUM_CHARS,
 922                                   num_main_syms - LZX_NUM_CHARS);
 923
 924         LZX_DEBUG("Writing length code...");
 925
 926         /* Write the precode and lengths for the length code.  */
 927         lzx_write_compressed_code(ostream,
 928                                   codes->lens.len,
 929                                   prev_codes->lens.len,
 930                                   LZX_LENCODE_NUM_SYMBOLS);
 931
 932         LZX_DEBUG("Writing matches and literals...");
 933
 934         /* Write the actual matches and literals.  */
 935         lzx_write_matches_and_literals(ostream, block_type,
 936                                        chosen_matches, num_chosen_matches,
 937                                        codes);
 938
 939         LZX_DEBUG("Done writing block.");
 940 }
 941
 942 /* Write out the LZX blocks that were computed.  */
 943 static void
 944 lzx_write_all_blocks(struct lzx_compressor *ctx, struct output_bitstream *ostream)
 945 {
 946
 947         const struct lzx_codes *prev_codes = &ctx->zero_codes;
 948         for (unsigned i = 0; i < ctx->num_blocks; i++) {
 949                 const struct lzx_block_spec *spec = &ctx->block_specs[i];
 950
 951                 LZX_DEBUG("Writing block %u/%u (type=%d, size=%u, num_chosen_matches=%u)...",
 952                           i + 1, ctx->num_blocks,
 953                           spec->block_type, spec->block_size,
 954                           spec->num_chosen_matches);
 955
 956                 lzx_write_compressed_block(spec->block_type,
 957                                            spec->block_size,
 958                                            ctx->max_window_size,
 959                                            ctx->num_main_syms,
 960                                            &ctx->chosen_matches[spec->chosen_matches_start_pos],
 961                                            spec->num_chosen_matches,
 962                                            &spec->codes,
 963                                            prev_codes,
 964                                            ostream);
 965
 966                 prev_codes = &spec->codes;
 967         }
 968 }
 969
 970 /* Constructs an LZX match from a literal byte and updates the main code symbol
 971  * frequencies.  */
 972 static u32
 973 lzx_tally_literal(u8 lit, struct lzx_freqs *freqs)
 974 {
 975         freqs->main[lit]++;
 976         return (u32)lit;
 977 }
 978
 979 /* Constructs an LZX match from an offset and a length, and updates the LRU
 980  * queue and the frequency of symbols in the main, length, and aligned offset
 981  * alphabets.  The return value is a 32-bit number that provides the match in an
 982  * intermediate representation documented below.  */
 983 static u32
 984 lzx_tally_match(unsigned match_len, unsigned match_offset,
 985                 struct lzx_freqs *freqs, struct lzx_lru_queue *queue)
 986 {
 987         unsigned position_slot;
 988         unsigned position_footer;
 989         u32 len_header;
 990         unsigned main_symbol;
 991         unsigned len_footer;
 992         unsigned adjusted_match_len;
 993
 994         LZX_ASSERT(match_len >= LZX_MIN_MATCH_LEN && match_len <= LZX_MAX_MATCH_LEN);
 995
 996         /* The match offset shall be encoded as a position slot (itself encoded
 997          * as part of the main symbol) and a position footer.  */
 998         position_slot = lzx_get_position_slot(match_offset, queue);
 999         position_footer = (match_offset + LZX_OFFSET_OFFSET) &
1000                                 ((1U << lzx_get_num_extra_bits(position_slot)) - 1);
1001
1002         /* The match length shall be encoded as a length header (itself encoded
1003          * as part of the main symbol) and an optional length footer.  */
1004         adjusted_match_len = match_len - LZX_MIN_MATCH_LEN;
1005         if (adjusted_match_len < LZX_NUM_PRIMARY_LENS) {
1006                 /* No length footer needed.  */
1007                 len_header = adjusted_match_len;
1008         } else {
1009                 /* Length footer needed.  It will be encoded using the length
1010                  * code.  */
1011                 len_header = LZX_NUM_PRIMARY_LENS;
1012                 len_footer = adjusted_match_len - LZX_NUM_PRIMARY_LENS;
1013                 freqs->len[len_footer]++;
1014         }
1015
1016         /* Account for the main symbol.  */
1017         main_symbol = ((position_slot << 3) | len_header) + LZX_NUM_CHARS;
1018
1019         freqs->main[main_symbol]++;
1020
1021         /* In an aligned offset block, 3 bits of the position footer are output
1022          * as an aligned offset symbol.  Account for this, although we may
1023          * ultimately decide to output the block as verbatim.  */
1024
1025         /* The following check is equivalent to:
1026          *
1027          * if (lzx_extra_bits[position_slot] >= 3)
1028          *
1029          * Note that this correctly excludes position slots that correspond to
1030          * recent offsets.  */
1031         if (position_slot >= 8)
1032                 freqs->aligned[position_footer & 7]++;
1033
1034         /* Pack the position slot, position footer, and match length into an
1035          * intermediate representation.  See `struct lzx_match' for details.
1036          */
1037         LZX_ASSERT(LZX_MAX_POSITION_SLOTS <= 64);
1038         LZX_ASSERT(lzx_get_num_extra_bits(LZX_MAX_POSITION_SLOTS - 1) <= 17);
1039         LZX_ASSERT(LZX_MAX_MATCH_LEN - LZX_MIN_MATCH_LEN + 1 <= 256);
1040
1041         LZX_ASSERT(position_slot      <= (1U << (31 - 25)) - 1);
1042         LZX_ASSERT(position_footer    <= (1U << (25 -  8)) - 1);
1043         LZX_ASSERT(adjusted_match_len <= (1U << (8  -  0)) - 1);
1044         return 0x80000000 |
1045                 (position_slot << 25) |
1046                 (position_footer << 8) |
1047                 (adjusted_match_len);
1048 }
1049
1050 struct lzx_record_ctx {
1051         struct lzx_freqs freqs;
1052         struct lzx_lru_queue queue;
1053         struct lzx_match *matches;
1054 };
1055
1056 static void
1057 lzx_record_match(unsigned len, unsigned offset, void *_ctx)
1058 {
1059         struct lzx_record_ctx *ctx = _ctx;
1060
1061         (ctx->matches++)->data = lzx_tally_match(len, offset, &ctx->freqs, &ctx->queue);
1062 }
1063
1064 static void
1065 lzx_record_literal(u8 lit, void *_ctx)
1066 {
1067         struct lzx_record_ctx *ctx = _ctx;
1068
1069         (ctx->matches++)->data = lzx_tally_literal(lit, &ctx->freqs);
1070 }
1071
1072 /* Returns the cost, in bits, to output a literal byte using the specified cost
1073  * model.  */
1074 static unsigned
1075 lzx_literal_cost(u8 c, const struct lzx_costs * costs)
1076 {
1077         return costs->main[c];
1078 }
1079
1080 /* Given a (length, offset) pair that could be turned into a valid LZX match as
1081  * well as costs for the codewords in the main, length, and aligned Huffman
1082  * codes, return the approximate number of bits it will take to represent this
1083  * match in the compressed output.  Take into account the match offset LRU
1084  * queue and optionally update it.  */
1085 static unsigned
1086 lzx_match_cost(unsigned length, unsigned offset, const struct lzx_costs *costs,
1087                struct lzx_lru_queue *queue)
1088 {
1089         unsigned position_slot;
1090         unsigned len_header, main_symbol;
1091         unsigned cost = 0;
1092
1093         position_slot = lzx_get_position_slot(offset, queue);
1094
1095         len_header = min(length - LZX_MIN_MATCH_LEN, LZX_NUM_PRIMARY_LENS);
1096         main_symbol = ((position_slot << 3) | len_header) + LZX_NUM_CHARS;
1097
1098         /* Account for main symbol.  */
1099         cost += costs->main[main_symbol];
1100
1101         /* Account for extra position information.  */
1102         unsigned num_extra_bits = lzx_get_num_extra_bits(position_slot);
1103         if (num_extra_bits >= 3) {
1104                 cost += num_extra_bits - 3;
1105                 cost += costs->aligned[(offset + LZX_OFFSET_OFFSET) & 7];
1106         } else {
1107                 cost += num_extra_bits;
1108         }
1109
1110         /* Account for extra length information.  */
1111         if (len_header == LZX_NUM_PRIMARY_LENS)
1112                 cost += costs->len[length - LZX_MIN_MATCH_LEN - LZX_NUM_PRIMARY_LENS];
1113
1114         return cost;
1115
1116 }
1117
1118 /* Fast heuristic cost evaluation to use in the inner loop of the match-finder.
1119  * Unlike lzx_match_cost() which does a true cost evaluation, this simply
1120  * prioritize matches based on their offset.  */
1121 static input_idx_t
1122 lzx_match_cost_fast(input_idx_t length, input_idx_t offset, const void *_queue)
1123 {
1124         const struct lzx_lru_queue *queue = _queue;
1125
1126         /* It seems well worth it to take the time to give priority to recently
1127          * used offsets.  */
1128         for (input_idx_t i = 0; i < LZX_NUM_RECENT_OFFSETS; i++)
1129                 if (offset == queue->R[i])
1130                         return i;
1131
1132         return offset;
1133 }
1134
1135 /* Set the cost model @ctx->costs from the Huffman codeword lengths specified in
1136  * @lens.
1137  *
1138  * The cost model and codeword lengths are almost the same thing, but the
1139  * Huffman codewords with length 0 correspond to symbols with zero frequency
1140  * that still need to be assigned actual costs.  The specific values assigned
1141  * are arbitrary, but they should be fairly high (near the maximum codeword
1142  * length) to take into account the fact that uses of these symbols are expected
1143  * to be rare.  */
1144 static void
1145 lzx_set_costs(struct lzx_compressor * ctx, const struct lzx_lens * lens)
1146 {
1147         unsigned i;
1148         unsigned num_main_syms = ctx->num_main_syms;
1149
1150         /* Main code  */
1151         for (i = 0; i < num_main_syms; i++) {
1152                 ctx->costs.main[i] = lens->main[i];
1153                 if (ctx->costs.main[i] == 0)
1154                         ctx->costs.main[i] = ctx->params.alg_params.slow.main_nostat_cost;
1155         }
1156
1157         /* Length code  */
1158         for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++) {
1159                 ctx->costs.len[i] = lens->len[i];
1160                 if (ctx->costs.len[i] == 0)
1161                         ctx->costs.len[i] = ctx->params.alg_params.slow.len_nostat_cost;
1162         }
1163
1164         /* Aligned offset code  */
1165         for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
1166                 ctx->costs.aligned[i] = lens->aligned[i];
1167                 if (ctx->costs.aligned[i] == 0)
1168                         ctx->costs.aligned[i] = ctx->params.alg_params.slow.aligned_nostat_cost;
1169         }
1170 }
1171
1172 /* Tell the match-finder to skip the specified number of bytes (@n) in the
1173  * input.  */
1174 static void
1175 lzx_lz_skip_bytes(struct lzx_compressor *ctx, unsigned n)
1176 {
1177         LZX_ASSERT(n <= ctx->match_window_end - ctx->match_window_pos);
1178         if (ctx->matches_cached) {
1179                 ctx->match_window_pos += n;
1180                 while (n--) {
1181                         ctx->cached_matches_pos +=
1182                                 ctx->cached_matches[ctx->cached_matches_pos].len + 1;
1183                 }
1184         } else {
1185                 while (n--) {
1186                         ctx->cached_matches[ctx->cached_matches_pos++].len = 0;
1187                         lz_sarray_skip_position(&ctx->lz_sarray);
1188                         ctx->match_window_pos++;
1189                 }
1190                 LZX_ASSERT(lz_sarray_get_pos(&ctx->lz_sarray) == ctx->match_window_pos);
1191         }
1192 }
1193
1194 /* Retrieve a list of matches available at the next position in the input.
1195  *
1196  * The matches are written to ctx->matches in decreasing order of length, and
1197  * the return value is the number of matches found.  */
1198 static unsigned
1199 lzx_lz_get_matches_caching(struct lzx_compressor *ctx,
1200                            const struct lzx_lru_queue *queue,
1201                            struct raw_match **matches_ret)
1202 {
1203         unsigned num_matches;
1204         struct raw_match *matches;
1205
1206         LZX_ASSERT(ctx->match_window_pos <= ctx->match_window_end);
1207
1208         matches = &ctx->cached_matches[ctx->cached_matches_pos + 1];
1209
1210         if (ctx->matches_cached) {
1211                 num_matches = matches[-1].len;
1212         } else {
1213                 LZX_ASSERT(lz_sarray_get_pos(&ctx->lz_sarray) == ctx->match_window_pos);
1214                 num_matches = lz_sarray_get_matches(&ctx->lz_sarray,
1215                                                     matches,
1216                                                     lzx_match_cost_fast,
1217                                                     queue);
1218                 matches[-1].len = num_matches;
1219         }
1220         ctx->cached_matches_pos += num_matches + 1;
1221         *matches_ret = matches;
1222
1223         /* Cap the length of returned matches to the number of bytes remaining,
1224          * if it is not the whole window.  */
1225         if (ctx->match_window_end < ctx->window_size) {
1226                 unsigned maxlen = ctx->match_window_end - ctx->match_window_pos;
1227                 for (unsigned i = 0; i < num_matches; i++)
1228                         if (matches[i].len > maxlen)
1229                                 matches[i].len = maxlen;
1230         }
1231 #if 0
1232         fprintf(stderr, "Pos %u/%u: %u matches\n",
1233                 ctx->match_window_pos, ctx->match_window_end, num_matches);
1234         for (unsigned i = 0; i < num_matches; i++)
1235                 fprintf(stderr, "\tLen %u Offset %u\n", matches[i].len, matches[i].offset);
1236 #endif
1237
1238 #ifdef ENABLE_LZX_DEBUG
1239         for (unsigned i = 0; i < num_matches; i++) {
1240                 LZX_ASSERT(matches[i].len >= LZX_MIN_MATCH_LEN);
1241                 LZX_ASSERT(matches[i].len <= LZX_MAX_MATCH_LEN);
1242                 LZX_ASSERT(matches[i].len <= ctx->match_window_end - ctx->match_window_pos);
1243                 LZX_ASSERT(matches[i].offset > 0);
1244                 LZX_ASSERT(matches[i].offset <= ctx->match_window_pos);
1245                 LZX_ASSERT(!memcmp(&ctx->window[ctx->match_window_pos],
1246                                    &ctx->window[ctx->match_window_pos - matches[i].offset],
1247                                    matches[i].len));
1248         }
1249 #endif
1250
1251         ctx->match_window_pos++;
1252         return num_matches;
1253 }
1254
1255 /*
1256  * Reverse the linked list of near-optimal matches so that they can be returned
1257  * in forwards order.
1258  *
1259  * Returns the first match in the list.
1260  */
1261 static struct raw_match
1262 lzx_lz_reverse_near_optimal_match_list(struct lzx_compressor *ctx,
1263                                        unsigned cur_pos)
1264 {
1265         unsigned prev_link, saved_prev_link;
1266         unsigned prev_match_offset, saved_prev_match_offset;
1267
1268         ctx->optimum_end_idx = cur_pos;
1269
1270         saved_prev_link = ctx->optimum[cur_pos].prev.link;
1271         saved_prev_match_offset = ctx->optimum[cur_pos].prev.match_offset;
1272
1273         do {
1274                 prev_link = saved_prev_link;
1275                 prev_match_offset = saved_prev_match_offset;
1276
1277                 saved_prev_link = ctx->optimum[prev_link].prev.link;
1278                 saved_prev_match_offset = ctx->optimum[prev_link].prev.match_offset;
1279
1280                 ctx->optimum[prev_link].next.link = cur_pos;
1281                 ctx->optimum[prev_link].next.match_offset = prev_match_offset;
1282
1283                 cur_pos = prev_link;
1284         } while (cur_pos != 0);
1285
1286         ctx->optimum_cur_idx = ctx->optimum[0].next.link;
1287
1288         return (struct raw_match)
1289                 { .len = ctx->optimum_cur_idx,
1290                   .offset = ctx->optimum[0].next.match_offset,
1291                 };
1292 }
1293
1294 /*
1295  * lzx_lz_get_near_optimal_match() -
1296  *
1297  * Choose the optimal match or literal to use at the next position in the input.
1298  *
1299  * Unlike a greedy parser that always takes the longest match, or even a
1300  * parser with one match/literal look-ahead like zlib, the algorithm used here
1301  * may look ahead many matches/literals to determine the optimal match/literal to
1302  * output next.  The motivation is that the compression ratio is improved if the
1303  * compressor can do things like use a shorter-than-possible match in order to
1304  * allow a longer match later, and also take into account the Huffman code cost
1305  * model rather than simply assuming that longer is better.
1306  *
1307  * Still, this is not truly an optimal parser because very long matches are
1308  * taken immediately, and the raw match-finder takes some shortcuts.  This is
1309  * done to avoid considering many different alternatives that are unlikely to
1310  * be significantly better.
1311  *
1312  * This algorithm is based on that used in 7-Zip's DEFLATE encoder.
1313  *
1314  * Each call to this function does one of two things:
1315  *
1316  * 1. Build a near-optimal sequence of matches/literals, up to some point, that
1317  *    will be returned by subsequent calls to this function, then return the
1318  *    first one.
1319  *
1320  * OR
1321  *
1322  * 2. Return the next match/literal previously computed by a call to this
1323  *    function;
1324  *
1325  * This function relies on the following state in the compressor context:
1326  *
1327  *      ctx->window          (read-only: preprocessed data being compressed)
1328  *      ctx->cost            (read-only: cost model to use)
1329  *      ctx->optimum         (internal state; leave uninitialized)
1330  *      ctx->optimum_cur_idx (must set to 0 before first call)
1331  *      ctx->optimum_end_idx (must set to 0 before first call)
1332  *
1333  *      Plus any state used by the raw match-finder.
1334  *
1335  * The return value is a (length, offset) pair specifying the match or literal
1336  * chosen.  For literals, the length is less than LZX_MIN_MATCH_LEN and the
1337  * offset is meaningless.
1338  */
1339 static struct raw_match
1340 lzx_lz_get_near_optimal_match(struct lzx_compressor * ctx)
1341 {
1342         unsigned num_possible_matches;
1343         struct raw_match *possible_matches;
1344         struct raw_match match;
1345         unsigned longest_match_len;
1346
1347         if (ctx->optimum_cur_idx != ctx->optimum_end_idx) {
1348                 /* Case 2: Return the next match/literal already found.  */
1349                 match.len = ctx->optimum[ctx->optimum_cur_idx].next.link -
1350                                     ctx->optimum_cur_idx;
1351                 match.offset = ctx->optimum[ctx->optimum_cur_idx].next.match_offset;
1352
1353                 ctx->optimum_cur_idx = ctx->optimum[ctx->optimum_cur_idx].next.link;
1354                 return match;
1355         }
1356
1357         /* Case 1:  Compute a new list of matches/literals to return.  */
1358
1359         ctx->optimum_cur_idx = 0;
1360         ctx->optimum_end_idx = 0;
1361
1362         /* Get matches at this position.  */
1363         num_possible_matches = lzx_lz_get_matches_caching(ctx, &ctx->queue, &possible_matches);
1364
1365         /* If no matches found, return literal.  */
1366         if (num_possible_matches == 0)
1367                 return (struct raw_match){ .len = 0 };
1368
1369         /* The matches that were found are sorted in decreasing order by length.
1370          * Get the length of the longest one.  */
1371         longest_match_len = possible_matches[0].len;
1372
1373         /* Greedy heuristic:  if the longest match that was found is greater
1374          * than the number of fast bytes, return it immediately; don't both
1375          * doing more work.  */
1376         if (longest_match_len > ctx->params.alg_params.slow.num_fast_bytes) {
1377                 lzx_lz_skip_bytes(ctx, longest_match_len - 1);
1378                 return possible_matches[0];
1379         }
1380
1381         /* Calculate the cost to reach the next position by outputting a
1382          * literal.  */
1383         ctx->optimum[0].queue = ctx->queue;
1384         ctx->optimum[1].queue = ctx->optimum[0].queue;
1385         ctx->optimum[1].cost = lzx_literal_cost(ctx->window[ctx->match_window_pos],
1386                                                 &ctx->costs);
1387         ctx->optimum[1].prev.link = 0;
1388
1389         /* Calculate the cost to reach any position up to and including that
1390          * reached by the longest match, using the shortest (i.e. closest) match
1391          * that reaches each position.  */
1392         BUILD_BUG_ON(LZX_MIN_MATCH_LEN != 2);
1393         for (unsigned len = LZX_MIN_MATCH_LEN, match_idx = num_possible_matches - 1;
1394              len <= longest_match_len; len++) {
1395
1396                 LZX_ASSERT(match_idx < num_possible_matches);
1397
1398                 ctx->optimum[len].queue = ctx->optimum[0].queue;
1399                 ctx->optimum[len].prev.link = 0;
1400                 ctx->optimum[len].prev.match_offset = possible_matches[match_idx].offset;
1401                 ctx->optimum[len].cost = lzx_match_cost(len,
1402                                                         possible_matches[match_idx].offset,
1403                                                         &ctx->costs,
1404                                                         &ctx->optimum[len].queue);
1405                 if (len == possible_matches[match_idx].len)
1406                         match_idx--;
1407         }
1408
1409         unsigned cur_pos = 0;
1410
1411         /* len_end: greatest index forward at which costs have been calculated
1412          * so far  */
1413         unsigned len_end = longest_match_len;
1414
1415         for (;;) {
1416                 /* Advance to next position.  */
1417                 cur_pos++;
1418
1419                 if (cur_pos == len_end || cur_pos == LZX_OPTIM_ARRAY_SIZE)
1420                         return lzx_lz_reverse_near_optimal_match_list(ctx, cur_pos);
1421
1422                 /* retrieve the number of matches available at this position  */
1423                 num_possible_matches = lzx_lz_get_matches_caching(ctx, &ctx->optimum[cur_pos].queue,
1424                                                                   &possible_matches);
1425
1426                 unsigned new_len = 0;
1427
1428                 if (num_possible_matches != 0) {
1429                         new_len = possible_matches[0].len;
1430
1431                         /* Greedy heuristic:  if we found a match greater than
1432                          * the number of fast bytes, stop immediately.  */
1433                         if (new_len > ctx->params.alg_params.slow.num_fast_bytes) {
1434
1435                                 /* Build the list of matches to return and get
1436                                  * the first one.  */
1437                                 match = lzx_lz_reverse_near_optimal_match_list(ctx, cur_pos);
1438
1439                                 /* Append the long match to the end of the list.  */
1440                                 ctx->optimum[cur_pos].next.match_offset =
1441                                         possible_matches[0].offset;
1442                                 ctx->optimum[cur_pos].next.link = cur_pos + new_len;
1443                                 ctx->optimum_end_idx = cur_pos + new_len;
1444
1445                                 /* Skip over the remaining bytes of the long match.  */
1446                                 lzx_lz_skip_bytes(ctx, new_len - 1);
1447
1448                                 /* Return first match in the list  */
1449                                 return match;
1450                         }
1451                 }
1452
1453                 /* Consider proceeding with a literal byte.  */
1454                 block_cost_t cur_cost = ctx->optimum[cur_pos].cost;
1455                 block_cost_t cur_plus_literal_cost = cur_cost +
1456                         lzx_literal_cost(ctx->window[ctx->match_window_pos - 1],
1457                                          &ctx->costs);
1458                 if (cur_plus_literal_cost < ctx->optimum[cur_pos + 1].cost) {
1459                         ctx->optimum[cur_pos + 1].cost = cur_plus_literal_cost;
1460                         ctx->optimum[cur_pos + 1].prev.link = cur_pos;
1461                         ctx->optimum[cur_pos + 1].queue = ctx->optimum[cur_pos].queue;
1462                 }
1463
1464                 if (num_possible_matches == 0)
1465                         continue;
1466
1467                 /* Consider proceeding with a match.  */
1468
1469                 while (len_end < cur_pos + new_len)
1470                         ctx->optimum[++len_end].cost = INFINITE_BLOCK_COST;
1471
1472                 for (unsigned len = LZX_MIN_MATCH_LEN, match_idx = num_possible_matches - 1;
1473                      len <= new_len; len++) {
1474                         LZX_ASSERT(match_idx < num_possible_matches);
1475                         struct lzx_lru_queue q = ctx->optimum[cur_pos].queue;
1476                         block_cost_t cost = cur_cost + lzx_match_cost(len,
1477                                                                       possible_matches[match_idx].offset,
1478                                                                       &ctx->costs,
1479                                                                       &q);
1480
1481                         if (cost < ctx->optimum[cur_pos + len].cost) {
1482                                 ctx->optimum[cur_pos + len].cost = cost;
1483                                 ctx->optimum[cur_pos + len].prev.link = cur_pos;
1484                                 ctx->optimum[cur_pos + len].prev.match_offset =
1485                                                 possible_matches[match_idx].offset;
1486                                 ctx->optimum[cur_pos + len].queue = q;
1487                         }
1488
1489                         if (len == possible_matches[match_idx].len)
1490                                 match_idx--;
1491                 }
1492         }
1493 }
1494
1495 /*
1496  * Set default symbol costs.
1497  */
1498 static void
1499 lzx_set_default_costs(struct lzx_costs * costs, unsigned num_main_syms)
1500 {
1501         unsigned i;
1502
1503         /* Literal symbols  */
1504         for (i = 0; i < LZX_NUM_CHARS; i++)
1505                 costs->main[i] = 8;
1506
1507         /* Match header symbols  */
1508         for (; i < num_main_syms; i++)
1509                 costs->main[i] = 10;
1510
1511         /* Length symbols  */
1512         for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++)
1513                 costs->len[i] = 8;
1514
1515         /* Aligned offset symbols  */
1516         for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++)
1517                 costs->aligned[i] = 3;
1518 }
1519
1520 /* Given the frequencies of symbols in a compressed block and the corresponding
1521  * Huffman codes, return LZX_BLOCKTYPE_ALIGNED or LZX_BLOCKTYPE_VERBATIM if an
1522  * aligned offset or verbatim block, respectively, will take fewer bits to
1523  * output.  */
1524 static int
1525 lzx_choose_verbatim_or_aligned(const struct lzx_freqs * freqs,
1526                                const struct lzx_codes * codes)
1527 {
1528         unsigned aligned_cost = 0;
1529         unsigned verbatim_cost = 0;
1530
1531         /* Verbatim blocks have a constant 3 bits per position footer.  Aligned
1532          * offset blocks have an aligned offset symbol per position footer, plus
1533          * an extra 24 bits to output the lengths necessary to reconstruct the
1534          * aligned offset code itself.  */
1535         for (unsigned i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
1536                 verbatim_cost += 3 * freqs->aligned[i];
1537                 aligned_cost += codes->lens.aligned[i] * freqs->aligned[i];
1538         }
1539         aligned_cost += LZX_ALIGNEDCODE_ELEMENT_SIZE * LZX_ALIGNEDCODE_NUM_SYMBOLS;
1540         if (aligned_cost < verbatim_cost)
1541                 return LZX_BLOCKTYPE_ALIGNED;
1542         else
1543                 return LZX_BLOCKTYPE_VERBATIM;
1544 }
1545
1546 /* Find a near-optimal sequence of matches/literals with which to output the
1547  * specified LZX block, then set its type to that which has the minimum cost to
1548  * output.  */
1549 static void
1550 lzx_optimize_block(struct lzx_compressor *ctx, struct lzx_block_spec *spec,
1551                    unsigned num_passes)
1552 {
1553         const struct lzx_lru_queue orig_queue = ctx->queue;
1554         struct lzx_freqs freqs;
1555
1556         unsigned orig_window_pos = spec->window_pos;
1557         unsigned orig_cached_pos = ctx->cached_matches_pos;
1558
1559         LZX_ASSERT(ctx->match_window_pos == spec->window_pos);
1560
1561         ctx->match_window_end = spec->window_pos + spec->block_size;
1562         spec->chosen_matches_start_pos = spec->window_pos;
1563
1564         LZX_ASSERT(num_passes >= 1);
1565
1566         /* The first optimal parsing pass is done using the cost model already
1567          * set in ctx->costs.  Each later pass is done using a cost model
1568          * computed from the previous pass.  */
1569         for (unsigned pass = 0; pass < num_passes; pass++) {
1570
1571                 ctx->match_window_pos = orig_window_pos;
1572                 ctx->cached_matches_pos = orig_cached_pos;
1573                 ctx->queue = orig_queue;
1574                 spec->num_chosen_matches = 0;
1575                 memset(&freqs, 0, sizeof(freqs));
1576
1577                 for (unsigned i = spec->window_pos; i < spec->window_pos + spec->block_size; ) {
1578                         struct raw_match raw_match;
1579                         struct lzx_match lzx_match;
1580
1581                         raw_match = lzx_lz_get_near_optimal_match(ctx);
1582                         if (raw_match.len >= LZX_MIN_MATCH_LEN) {
1583                                 lzx_match.data = lzx_tally_match(raw_match.len, raw_match.offset,
1584                                                                  &freqs, &ctx->queue);
1585                                 i += raw_match.len;
1586                         } else {
1587                                 lzx_match.data = lzx_tally_literal(ctx->window[i], &freqs);
1588                                 i += 1;
1589                         }
1590                         ctx->chosen_matches[spec->chosen_matches_start_pos +
1591                                             spec->num_chosen_matches++] = lzx_match;
1592                 }
1593
1594                 lzx_make_huffman_codes(&freqs, &spec->codes,
1595                                        ctx->num_main_syms);
1596                 if (pass < num_passes - 1)
1597                         lzx_set_costs(ctx, &spec->codes.lens);
1598                 ctx->matches_cached = true;
1599         }
1600         spec->block_type = lzx_choose_verbatim_or_aligned(&freqs, &spec->codes);
1601         ctx->matches_cached = false;
1602 }
1603
1604 static void
1605 lzx_optimize_blocks(struct lzx_compressor *ctx)
1606 {
1607         lzx_lru_queue_init(&ctx->queue);
1608         ctx->optimum_cur_idx = 0;
1609         ctx->optimum_end_idx = 0;
1610
1611         const unsigned num_passes = ctx->params.alg_params.slow.num_optim_passes;
1612
1613         for (unsigned i = 0; i < ctx->num_blocks; i++)
1614                 lzx_optimize_block(ctx, &ctx->block_specs[i], num_passes);
1615 }
1616
1617 /* Prepare the input window into one or more LZX blocks ready to be output.  */
1618 static void
1619 lzx_prepare_blocks(struct lzx_compressor * ctx)
1620 {
1621         /* Initialize the match-finder.  */
1622         lz_sarray_load_window(&ctx->lz_sarray, ctx->window, ctx->window_size);
1623         ctx->cached_matches_pos = 0;
1624         ctx->matches_cached = false;
1625         ctx->match_window_pos = 0;
1626
1627         /* Set up a default cost model.  */
1628         lzx_set_default_costs(&ctx->costs, ctx->num_main_syms);
1629
1630         ctx->num_blocks = DIV_ROUND_UP(ctx->window_size, LZX_DIV_BLOCK_SIZE);
1631         for (unsigned i = 0; i < ctx->num_blocks; i++) {
1632                 unsigned pos = LZX_DIV_BLOCK_SIZE * i;
1633                 ctx->block_specs[i].window_pos = pos;
1634                 ctx->block_specs[i].block_size = min(ctx->window_size - pos, LZX_DIV_BLOCK_SIZE);
1635         }
1636
1637         /* Determine sequence of matches/literals to output for each block.  */
1638         lzx_optimize_blocks(ctx);
1639 }
1640
1641 /*
1642  * This is the fast version of lzx_prepare_blocks().  This version "quickly"
1643  * prepares a single compressed block containing the entire input.  See the
1644  * description of the "Fast algorithm" at the beginning of this file for more
1645  * information.
1646  *
1647  * Input ---  the preprocessed data:
1648  *
1649  *      ctx->window[]
1650  *      ctx->window_size
1651  *
1652  * Output --- the block specification and the corresponding match/literal data:
1653  *
1654  *      ctx->block_specs[]
1655  *      ctx->num_blocks
1656  *      ctx->chosen_matches[]
1657  */
1658 static void
1659 lzx_prepare_block_fast(struct lzx_compressor * ctx)
1660 {
1661         struct lzx_record_ctx record_ctx;
1662         struct lzx_block_spec *spec;
1663
1664         /* Parameters to hash chain LZ match finder
1665          * (lazy with 1 match lookahead)  */
1666         static const struct lz_params lzx_lz_params = {
1667                 /* Although LZX_MIN_MATCH_LEN == 2, length 2 matches typically
1668                  * aren't worth choosing when using greedy or lazy parsing.  */
1669                 .min_match      = 3,
1670                 .max_match      = LZX_MAX_MATCH_LEN,
1671                 .max_offset     = LZX_MAX_WINDOW_SIZE,
1672                 .good_match     = LZX_MAX_MATCH_LEN,
1673                 .nice_match     = LZX_MAX_MATCH_LEN,
1674                 .max_chain_len  = LZX_MAX_MATCH_LEN,
1675                 .max_lazy_match = LZX_MAX_MATCH_LEN,
1676                 .too_far        = 4096,
1677         };
1678
1679         /* Initialize symbol frequencies and match offset LRU queue.  */
1680         memset(&record_ctx.freqs, 0, sizeof(struct lzx_freqs));
1681         lzx_lru_queue_init(&record_ctx.queue);
1682         record_ctx.matches = ctx->chosen_matches;
1683
1684         /* Determine series of matches/literals to output.  */
1685         lz_analyze_block(ctx->window,
1686                          ctx->window_size,
1687                          lzx_record_match,
1688                          lzx_record_literal,
1689                          &record_ctx,
1690                          &lzx_lz_params,
1691                          ctx->prev_tab);
1692
1693         /* Set up block specification.  */
1694         spec = &ctx->block_specs[0];
1695         spec->block_type = LZX_BLOCKTYPE_ALIGNED;
1696         spec->window_pos = 0;
1697         spec->block_size = ctx->window_size;
1698         spec->num_chosen_matches = (record_ctx.matches - ctx->chosen_matches);
1699         spec->chosen_matches_start_pos = 0;
1700         lzx_make_huffman_codes(&record_ctx.freqs, &spec->codes,
1701                                ctx->num_main_syms);
1702         ctx->num_blocks = 1;
1703 }
1704
1705 static void
1706 do_call_insn_translation(u32 *call_insn_target, int input_pos,
1707                          s32 file_size)
1708 {
1709         s32 abs_offset;
1710         s32 rel_offset;
1711
1712         rel_offset = le32_to_cpu(*call_insn_target);
1713         if (rel_offset >= -input_pos && rel_offset < file_size) {
1714                 if (rel_offset < file_size - input_pos) {
1715                         /* "good translation" */
1716                         abs_offset = rel_offset + input_pos;
1717                 } else {
1718                         /* "compensating translation" */
1719                         abs_offset = rel_offset - file_size;
1720                 }
1721                 *call_insn_target = cpu_to_le32(abs_offset);
1722         }
1723 }
1724
1725 /* This is the reverse of undo_call_insn_preprocessing() in lzx-decompress.c.
1726  * See the comment above that function for more information.  */
1727 static void
1728 do_call_insn_preprocessing(u8 data[], int size)
1729 {
1730         for (int i = 0; i < size - 10; i++) {
1731                 if (data[i] == 0xe8) {
1732                         do_call_insn_translation((u32*)&data[i + 1], i,
1733                                                  LZX_WIM_MAGIC_FILESIZE);
1734                         i += 4;
1735                 }
1736         }
1737 }
1738
1739 static size_t
1740 lzx_compress(const void *uncompressed_data, size_t uncompressed_size,
1741              void *compressed_data, size_t compressed_size_avail, void *_ctx)
1742 {
1743         struct lzx_compressor *ctx = _ctx;
1744         struct output_bitstream ostream;
1745         size_t compressed_size;
1746
1747         if (uncompressed_size < 100) {
1748                 LZX_DEBUG("Too small to bother compressing.");
1749                 return 0;
1750         }
1751
1752         if (uncompressed_size > ctx->max_window_size) {
1753                 LZX_DEBUG("Can't compress %zu bytes using window of %u bytes!",
1754                           uncompressed_size, ctx->max_window_size);
1755                 return 0;
1756         }
1757
1758         LZX_DEBUG("Attempting to compress %zu bytes...",
1759                   uncompressed_size);
1760
1761         /* The input data must be preprocessed.  To avoid changing the original
1762          * input, copy it to a temporary buffer.  */
1763         memcpy(ctx->window, uncompressed_data, uncompressed_size);
1764         ctx->window_size = uncompressed_size;
1765
1766         /* This line is unnecessary; it just avoids inconsequential accesses of
1767          * uninitialized memory that would show up in memory-checking tools such
1768          * as valgrind.  */
1769         memset(&ctx->window[ctx->window_size], 0, 12);
1770
1771         LZX_DEBUG("Preprocessing data...");
1772
1773         /* Before doing any actual compression, do the call instruction (0xe8
1774          * byte) translation on the uncompressed data.  */
1775         do_call_insn_preprocessing(ctx->window, ctx->window_size);
1776
1777         LZX_DEBUG("Preparing blocks...");
1778
1779         /* Prepare the compressed data.  */
1780         if (ctx->params.algorithm == WIMLIB_LZX_ALGORITHM_FAST)
1781                 lzx_prepare_block_fast(ctx);
1782         else
1783                 lzx_prepare_blocks(ctx);
1784
1785         LZX_DEBUG("Writing compressed blocks...");
1786
1787         /* Generate the compressed data.  */
1788         init_output_bitstream(&ostream, compressed_data, compressed_size_avail);
1789         lzx_write_all_blocks(ctx, &ostream);
1790
1791         LZX_DEBUG("Flushing bitstream...");
1792         compressed_size = flush_output_bitstream(&ostream);
1793         if (compressed_size == ~(input_idx_t)0) {
1794                 LZX_DEBUG("Data did not compress to %zu bytes or less!",
1795                           compressed_size_avail);
1796                 return 0;
1797         }
1798
1799         LZX_DEBUG("Done: compressed %zu => %zu bytes.",
1800                   uncompressed_size, compressed_size);
1801
1802         /* Verify that we really get the same thing back when decompressing.
1803          * Although this could be disabled by default in all cases, it only
1804          * takes around 2-3% of the running time of the slow algorithm to do the
1805          * verification.  */
1806         if (ctx->params.algorithm == WIMLIB_LZX_ALGORITHM_SLOW
1807         #if defined(ENABLE_LZX_DEBUG) || defined(ENABLE_VERIFY_COMPRESSION)
1808             || 1
1809         #endif
1810             )
1811         {
1812                 struct wimlib_decompressor *decompressor;
1813
1814                 if (0 == wimlib_create_decompressor(WIMLIB_COMPRESSION_TYPE_LZX,
1815                                                     ctx->max_window_size,
1816                                                     NULL,
1817                                                     &decompressor))
1818                 {
1819                         int ret;
1820                         ret = wimlib_decompress(compressed_data,
1821                                                 compressed_size,
1822                                                 ctx->window,
1823                                                 uncompressed_size,
1824                                                 decompressor);
1825                         wimlib_free_decompressor(decompressor);
1826
1827                         if (ret) {
1828                                 ERROR("Failed to decompress data we "
1829                                       "compressed using LZX algorithm");
1830                                 wimlib_assert(0);
1831                                 return 0;
1832                         }
1833                         if (memcmp(uncompressed_data, ctx->window, uncompressed_size)) {
1834                                 ERROR("Data we compressed using LZX algorithm "
1835                                       "didn't decompress to original");
1836                                 wimlib_assert(0);
1837                                 return 0;
1838                         }
1839                 } else {
1840                         WARNING("Failed to create decompressor for "
1841                                 "data verification!");
1842                 }
1843         }
1844         return compressed_size;
1845 }
1846
1847 static bool
1848 lzx_params_valid(const struct wimlib_lzx_compressor_params *params)
1849 {
1850         /* Validate parameters.  */
1851         if (params->hdr.size != sizeof(struct wimlib_lzx_compressor_params)) {
1852                 LZX_DEBUG("Invalid parameter structure size!");
1853                 return false;
1854         }
1855
1856         if (params->algorithm != WIMLIB_LZX_ALGORITHM_SLOW &&
1857             params->algorithm != WIMLIB_LZX_ALGORITHM_FAST)
1858         {
1859                 LZX_DEBUG("Invalid algorithm.");
1860                 return false;
1861         }
1862
1863         if (params->algorithm == WIMLIB_LZX_ALGORITHM_SLOW) {
1864                 if (params->alg_params.slow.num_optim_passes < 1)
1865                 {
1866                         LZX_DEBUG("Invalid number of optimization passes!");
1867                         return false;
1868                 }
1869
1870                 if (params->alg_params.slow.main_nostat_cost < 1 ||
1871                     params->alg_params.slow.main_nostat_cost > 16)
1872                 {
1873                         LZX_DEBUG("Invalid main_nostat_cost!");
1874                         return false;
1875                 }
1876
1877                 if (params->alg_params.slow.len_nostat_cost < 1 ||
1878                     params->alg_params.slow.len_nostat_cost > 16)
1879                 {
1880                         LZX_DEBUG("Invalid len_nostat_cost!");
1881                         return false;
1882                 }
1883
1884                 if (params->alg_params.slow.aligned_nostat_cost < 1 ||
1885                     params->alg_params.slow.aligned_nostat_cost > 8)
1886                 {
1887                         LZX_DEBUG("Invalid aligned_nostat_cost!");
1888                         return false;
1889                 }
1890         }
1891         return true;
1892 }
1893
1894 static void
1895 lzx_free_compressor(void *_ctx)
1896 {
1897         struct lzx_compressor *ctx = _ctx;
1898
1899         if (ctx) {
1900                 FREE(ctx->chosen_matches);
1901                 FREE(ctx->cached_matches);
1902                 FREE(ctx->optimum);
1903                 lz_sarray_destroy(&ctx->lz_sarray);
1904                 FREE(ctx->block_specs);
1905                 FREE(ctx->prev_tab);
1906                 FREE(ctx->window);
1907                 FREE(ctx);
1908         }
1909 }
1910
1911 static int
1912 lzx_create_compressor(size_t window_size,
1913                       const struct wimlib_compressor_params_header *_params,
1914                       void **ctx_ret)
1915 {
1916         const struct wimlib_lzx_compressor_params *params =
1917                 (const struct wimlib_lzx_compressor_params*)_params;
1918         struct lzx_compressor *ctx;
1919
1920         LZX_DEBUG("Allocating LZX context...");
1921
1922         if (!lzx_window_size_valid(window_size))
1923                 return WIMLIB_ERR_INVALID_PARAM;
1924
1925         static const struct wimlib_lzx_compressor_params fast_default = {
1926                 .hdr = {
1927                         .size = sizeof(struct wimlib_lzx_compressor_params),
1928                 },
1929                 .algorithm = WIMLIB_LZX_ALGORITHM_FAST,
1930                 .use_defaults = 0,
1931                 .alg_params = {
1932                         .fast = {
1933                         },
1934                 },
1935         };
1936         static const struct wimlib_lzx_compressor_params slow_default = {
1937                 .hdr = {
1938                         .size = sizeof(struct wimlib_lzx_compressor_params),
1939                 },
1940                 .algorithm = WIMLIB_LZX_ALGORITHM_SLOW,
1941                 .use_defaults = 0,
1942                 .alg_params = {
1943                         .slow = {
1944                                 .use_len2_matches = 1,
1945                                 .num_fast_bytes = 32,
1946                                 .num_optim_passes = 2,
1947                                 .max_search_depth = 50,
1948                                 .max_matches_per_pos = 3,
1949                                 .main_nostat_cost = 15,
1950                                 .len_nostat_cost = 15,
1951                                 .aligned_nostat_cost = 7,
1952                         },
1953                 },
1954         };
1955
1956         if (params) {
1957                 if (!lzx_params_valid(params))
1958                         return WIMLIB_ERR_INVALID_PARAM;
1959         } else {
1960                 LZX_DEBUG("Using default algorithm and parameters.");
1961                 params = &slow_default;
1962         }
1963
1964         if (params->use_defaults) {
1965                 if (params->algorithm == WIMLIB_LZX_ALGORITHM_SLOW)
1966                         params = &slow_default;
1967                 else
1968                         params = &fast_default;
1969         }
1970
1971         LZX_DEBUG("Allocating memory.");
1972
1973         ctx = CALLOC(1, sizeof(struct lzx_compressor));
1974         if (ctx == NULL)
1975                 goto oom;
1976
1977         ctx->num_main_syms = lzx_get_num_main_syms(window_size);
1978         ctx->max_window_size = window_size;
1979         ctx->window = MALLOC(window_size + 12);
1980         if (ctx->window == NULL)
1981                 goto oom;
1982
1983         if (params->algorithm == WIMLIB_LZX_ALGORITHM_FAST) {
1984                 ctx->prev_tab = MALLOC(window_size * sizeof(ctx->prev_tab[0]));
1985                 if (ctx->prev_tab == NULL)
1986                         goto oom;
1987         }
1988
1989         size_t block_specs_length = DIV_ROUND_UP(window_size, LZX_DIV_BLOCK_SIZE);
1990         ctx->block_specs = MALLOC(block_specs_length * sizeof(ctx->block_specs[0]));
1991         if (ctx->block_specs == NULL)
1992                 goto oom;
1993
1994         if (params->algorithm == WIMLIB_LZX_ALGORITHM_SLOW) {
1995                 unsigned min_match_len = LZX_MIN_MATCH_LEN;
1996                 if (!params->alg_params.slow.use_len2_matches)
1997                         min_match_len = max(min_match_len, 3);
1998
1999                 if (!lz_sarray_init(&ctx->lz_sarray,
2000                                     window_size,
2001                                     min_match_len,
2002                                     LZX_MAX_MATCH_LEN,
2003                                     params->alg_params.slow.max_search_depth,
2004                                     params->alg_params.slow.max_matches_per_pos))
2005                         goto oom;
2006         }
2007
2008         if (params->algorithm == WIMLIB_LZX_ALGORITHM_SLOW) {
2009                 ctx->optimum = MALLOC((LZX_OPTIM_ARRAY_SIZE + LZX_MAX_MATCH_LEN) *
2010                                        sizeof(ctx->optimum[0]));
2011                 if (ctx->optimum == NULL)
2012                         goto oom;
2013         }
2014
2015         if (params->algorithm == WIMLIB_LZX_ALGORITHM_SLOW) {
2016                 u32 cache_per_pos;
2017
2018                 cache_per_pos = params->alg_params.slow.max_matches_per_pos;
2019                 if (cache_per_pos > LZX_MAX_CACHE_PER_POS)
2020                         cache_per_pos = LZX_MAX_CACHE_PER_POS;
2021
2022                 ctx->cached_matches = MALLOC(window_size * (cache_per_pos + 1) *
2023                                              sizeof(ctx->cached_matches[0]));
2024                 if (ctx->cached_matches == NULL)
2025                         goto oom;
2026         }
2027
2028         ctx->chosen_matches = MALLOC(window_size * sizeof(ctx->chosen_matches[0]));
2029         if (ctx->chosen_matches == NULL)
2030                 goto oom;
2031
2032         memcpy(&ctx->params, params, sizeof(struct wimlib_lzx_compressor_params));
2033         memset(&ctx->zero_codes, 0, sizeof(ctx->zero_codes));
2034
2035         LZX_DEBUG("Successfully allocated new LZX context.");
2036
2037         *ctx_ret = ctx;
2038         return 0;
2039
2040 oom:
2041         lzx_free_compressor(ctx);
2042         return WIMLIB_ERR_NOMEM;
2043 }
2044
2045 const struct compressor_ops lzx_compressor_ops = {
2046         .create_compressor  = lzx_create_compressor,
2047         .compress           = lzx_compress,
2048         .free_compressor    = lzx_free_compressor,
2049 };