wimlib.net Git - wimlib/blob - src/compress_common.c

   1 /*
   2  * compress_common.c
   3  *
   4  * Code for compression shared among multiple compression formats.
   5  *
   6  * Author:  Eric Biggers
   7  * Year:    2012 - 2014
   8  *
   9  * The author dedicates this file to the public domain.
  10  * You can do whatever you want with this file.
  11  */
  12
  13 #ifdef HAVE_CONFIG_H
  14 #  include "config.h"
  15 #endif
  16
  17 #include "wimlib/assert.h"
  18 #include "wimlib/endianness.h"
  19 #include "wimlib/compiler.h"
  20 #include "wimlib/compress_common.h"
  21 #include "wimlib/util.h"
  22
  23 #include <string.h>
  24
  25 /* Writes @num_bits bits, given by the @num_bits least significant bits of
  26  * @bits, to the output @ostream. */
  27 void
  28 bitstream_put_bits(struct output_bitstream *ostream, u32 bits,
  29                    unsigned num_bits)
  30 {
  31         bits &= (1U << num_bits) - 1;
  32         while (num_bits > ostream->free_bits) {
  33                 /* Buffer variable does not have space for the new bits.  It
  34                  * needs to be flushed as a 16-bit integer.  Bits in the second
  35                  * byte logically precede those in the first byte
  36                  * (little-endian), but within each byte the bits are ordered
  37                  * from high to low.  This is true for both XPRESS and LZX
  38                  * compression.  */
  39
  40                 /* There must be at least 2 bytes of space remaining.  */
  41                 if (unlikely(ostream->bytes_remaining < 2)) {
  42                         ostream->overrun = true;
  43                         return;
  44                 }
  45
  46                 /* Fill the buffer with as many bits that fit.  */
  47                 unsigned fill_bits = ostream->free_bits;
  48
  49                 ostream->bitbuf <<= fill_bits;
  50                 ostream->bitbuf |= bits >> (num_bits - fill_bits);
  51
  52                 *(le16*)ostream->bit_output = cpu_to_le16(ostream->bitbuf);
  53                 ostream->bit_output = ostream->next_bit_output;
  54                 ostream->next_bit_output = ostream->output;
  55                 ostream->output += 2;
  56                 ostream->bytes_remaining -= 2;
  57
  58                 ostream->free_bits = 16;
  59                 num_bits -= fill_bits;
  60                 bits &= (1U << num_bits) - 1;
  61         }
  62
  63         /* Buffer variable has space for the new bits.  */
  64         ostream->bitbuf = (ostream->bitbuf << num_bits) | bits;
  65         ostream->free_bits -= num_bits;
  66 }
  67
  68 void
  69 bitstream_put_byte(struct output_bitstream *ostream, u8 n)
  70 {
  71         if (unlikely(ostream->bytes_remaining < 1)) {
  72                 ostream->overrun = true;
  73                 return;
  74         }
  75         *ostream->output++ = n;
  76         ostream->bytes_remaining--;
  77 }
  78
  79 /* Flushes any remaining bits to the output bitstream.
  80  *
  81  * Returns -1 if the stream has overrun; otherwise returns the total number of
  82  * bytes in the output.  */
  83 input_idx_t
  84 flush_output_bitstream(struct output_bitstream *ostream)
  85 {
  86         if (unlikely(ostream->overrun))
  87                 return ~(input_idx_t)0;
  88
  89         *(le16*)ostream->bit_output =
  90                 cpu_to_le16((u16)((u32)ostream->bitbuf << ostream->free_bits));
  91         *(le16*)ostream->next_bit_output =
  92                 cpu_to_le16(0);
  93
  94         return ostream->output - ostream->output_start;
  95 }
  96
  97 /* Initializes an output bit buffer to write its output to the memory location
  98  * pointer to by @data. */
  99 void
 100 init_output_bitstream(struct output_bitstream *ostream,
 101                       void *data, unsigned num_bytes)
 102 {
 103         wimlib_assert(num_bytes >= 4);
 104
 105         ostream->bitbuf              = 0;
 106         ostream->free_bits           = 16;
 107         ostream->output_start        = data;
 108         ostream->bit_output          = data;
 109         ostream->next_bit_output     = data + 2;
 110         ostream->output              = data + 4;
 111         ostream->bytes_remaining     = num_bytes - 4;
 112         ostream->overrun             = false;
 113 }
 114
 115 /* Given the binary tree node A[subtree_idx] whose children already
 116  * satisfy the maxheap property, swap the node with its greater child
 117  * until it is greater than both its children, so that the maxheap
 118  * property is satisfied in the subtree rooted at A[subtree_idx].  */
 119 static void
 120 heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx)
 121 {
 122         unsigned parent_idx;
 123         unsigned child_idx;
 124         u32 v;
 125
 126         v = A[subtree_idx];
 127         parent_idx = subtree_idx;
 128         while ((child_idx = parent_idx * 2) <= length) {
 129                 if (child_idx < length && A[child_idx + 1] > A[child_idx])
 130                         child_idx++;
 131                 if (v >= A[child_idx])
 132                         break;
 133                 A[parent_idx] = A[child_idx];
 134                 parent_idx = child_idx;
 135         }
 136         A[parent_idx] = v;
 137 }
 138
 139 /* Rearrange the array 'A' so that it satisfies the maxheap property.
 140  * 'A' uses 1-based indices, so the children of A[i] are A[i*2] and A[i*2 + 1].
 141  */
 142 static void
 143 heapify_array(u32 A[], unsigned length)
 144 {
 145         for (unsigned subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--)
 146                 heapify_subtree(A, length, subtree_idx);
 147 }
 148
 149 /* Sort the array 'A', which contains 'length' unsigned 32-bit integers.  */
 150 static void
 151 heapsort(u32 A[], unsigned length)
 152 {
 153         A--; /* Use 1-based indices  */
 154
 155         heapify_array(A, length);
 156
 157         while (length >= 2) {
 158                 swap(A[1], A[length]);
 159                 length--;
 160                 heapify_subtree(A, length, 1);
 161         }
 162 }
 163
 164 #define NUM_SYMBOL_BITS 10
 165 #define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1)
 166
 167 /*
 168  * Sort the symbols primarily by frequency and secondarily by symbol
 169  * value.  Discard symbols with zero frequency and fill in an array with
 170  * the remaining symbols, along with their frequencies.  The low
 171  * NUM_SYMBOL_BITS bits of each array entry will contain the symbol
 172  * value, and the remaining bits will contain the frequency.
 173  *
 174  * @num_syms
 175  *      Number of symbols in the alphabet.
 176  *      Can't be greater than (1 << NUM_SYMBOL_BITS).
 177  *
 178  * @freqs[num_syms]
 179  *      The frequency of each symbol.
 180  *
 181  * @lens[num_syms]
 182  *      An array that eventually will hold the length of each codeword.
 183  *      This function only fills in the codeword lengths for symbols that
 184  *      have zero frequency, which are not well defined per se but will
 185  *      be set to 0.
 186  *
 187  * @symout[num_syms]
 188  *      The output array, described above.
 189  *
 190  * Returns the number of entries in 'symout' that were filled.  This is
 191  * the number of symbols that have nonzero frequency.
 192  */
 193 static unsigned
 194 sort_symbols(unsigned num_syms, const u32 freqs[restrict],
 195              u8 lens[restrict], u32 symout[restrict])
 196 {
 197         unsigned num_used_syms;
 198         unsigned num_counters;
 199
 200         /* We rely on heapsort, but with an added optimization.  Since
 201          * it's common for most symbol frequencies to be low, we first do
 202          * a count sort using a limited number of counters.  High
 203          * frequencies will be counted in the last counter, and only they
 204          * will be sorted with heapsort.
 205          *
 206          * Note: with more symbols, it is generally beneficial to have more
 207          * counters.  About 1 counter per 4 symbols seems fast.
 208          *
 209          * Note: I also tested radix sort, but even for large symbol
 210          * counts (> 255) and frequencies bounded at 16 bits (enabling
 211          * radix sort by just two base-256 digits), it didn't seem any
 212          * faster than the method implemented here.
 213          *
 214          * Note: I tested the optimized quicksort implementation from
 215          * glibc (with indirection overhead removed), but it was only
 216          * marginally faster than the simple heapsort implemented here.
 217          *
 218          * Tests were done with building the codes for LZX.  Results may
 219          * vary for different compression algorithms...!  */
 220
 221         num_counters = (DIV_ROUND_UP(num_syms, 4) + 3) & ~3;
 222
 223         unsigned counters[num_counters];
 224
 225         memset(counters, 0, sizeof(counters));
 226
 227         /* Count the frequencies.  */
 228         for (unsigned sym = 0; sym < num_syms; sym++)
 229                 counters[min(freqs[sym], num_counters - 1)]++;
 230
 231         /* Make the counters cumulative, ignoring the zero-th, which
 232          * counted symbols with zero frequency.  As a side effect, this
 233          * calculates the number of symbols with nonzero frequency.  */
 234         num_used_syms = 0;
 235         for (unsigned i = 1; i < num_counters; i++) {
 236                 unsigned count = counters[i];
 237                 counters[i] = num_used_syms;
 238                 num_used_syms += count;
 239         }
 240
 241         /* Sort nonzero-frequency symbols using the counters.  At the
 242          * same time, set the codeword lengths of zero-frequency symbols
 243          * to 0.  */
 244         for (unsigned sym = 0; sym < num_syms; sym++) {
 245                 u32 freq = freqs[sym];
 246                 if (freq != 0) {
 247                         symout[counters[min(freq, num_counters - 1)]++] =
 248                                 sym | (freq << NUM_SYMBOL_BITS);
 249                 } else {
 250                         lens[sym] = 0;
 251                 }
 252         }
 253
 254         /* Sort the symbols counted in the last counter.  */
 255         heapsort(symout + counters[num_counters - 2],
 256                  counters[num_counters - 1] - counters[num_counters - 2]);
 257
 258         return num_used_syms;
 259 }
 260
 261 /*
 262  * Build the Huffman tree.
 263  *
 264  * This is an optimized implementation that
 265  *      (a) takes advantage of the frequencies being already sorted;
 266  *      (b) only generates non-leaf nodes, since the non-leaf nodes of a
 267  *          Huffman tree are sufficient to generate a canonical code;
 268  *      (c) Only stores parent pointers, not child pointers;
 269  *      (d) Produces the nodes in the same memory used for input
 270  *          frequency information.
 271  *
 272  * Array 'A', which contains 'sym_count' entries, is used for both input
 273  * and output.  For this function, 'sym_count' must be at least 2.
 274  *
 275  * For input, the array must contain the frequencies of the symbols,
 276  * sorted in increasing order.  Specifically, each entry must contain a
 277  * frequency left shifted by NUM_SYMBOL_BITS bits.  Any data in the low
 278  * NUM_SYMBOL_BITS bits of the entries will be ignored by this function.
 279  * Although these bits will, in fact, contain the symbols that correspond
 280  * to the frequencies, this function is concerned with frequencies only
 281  * and keeps the symbols as-is.
 282  *
 283  * For output, this function will produce the non-leaf nodes of the
 284  * Huffman tree.  These nodes will be stored in the first (sym_count - 1)
 285  * entries of the array.  Entry A[sym_count - 2] will represent the root
 286  * node.  Each other node will contain the zero-based index of its parent
 287  * node in 'A', left shifted by NUM_SYMBOL_BITS bits.  The low
 288  * NUM_SYMBOL_BITS bits of each entry in A will be kept as-is.  Again,
 289  * note that although these low bits will, in fact, contain a symbol
 290  * value, this symbol will have *no relationship* with the Huffman tree
 291  * node that happens to occupy the same slot.  This is because this
 292  * implementation only generates the non-leaf nodes of the tree.
 293  */
 294 static void
 295 build_tree(u32 A[], unsigned sym_count)
 296 {
 297         /* Index, in 'A', of next lowest frequency symbol that has not
 298          * yet been processed.  */
 299         unsigned i = 0;
 300
 301         /* Index, in 'A', of next lowest frequency parentless non-leaf
 302          * node; or, if equal to 'e', then no such node exists yet.  */
 303         unsigned b = 0;
 304
 305         /* Index, in 'A', of next node to allocate as a non-leaf.  */
 306         unsigned e = 0;
 307
 308         do {
 309                 unsigned m, n;
 310                 u32 freq_shifted;
 311
 312                 /* Choose the two next lowest frequency entries.  */
 313
 314                 if (i != sym_count &&
 315                     (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS)))
 316                         m = i++;
 317                 else
 318                         m = b++;
 319
 320                 if (i != sym_count &&
 321                     (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS)))
 322                         n = i++;
 323                 else
 324                         n = b++;
 325
 326                 /* Allocate a non-leaf node and link the entries to it.
 327                  *
 328                  * If we link an entry that we're visiting for the first
 329                  * time (via index 'i'), then we're actually linking a
 330                  * leaf node and it will have no effect, since the leaf
 331                  * will be overwritten with a non-leaf when index 'e'
 332                  * catches up to it.  But it's not any slower to
 333                  * unconditionally set the parent index.
 334                  *
 335                  * We also compute the frequency of the non-leaf node as
 336                  * the sum of its two children's frequencies.  */
 337
 338                 freq_shifted = (A[m] & ~SYMBOL_MASK) + (A[n] & ~SYMBOL_MASK);
 339
 340                 A[m] = (A[m] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS);
 341                 A[n] = (A[n] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS);
 342                 A[e] = (A[e] & SYMBOL_MASK) | freq_shifted;
 343                 e++;
 344         } while (sym_count - e > 1);
 345                 /* When just one entry remains, it is a "leaf" that was
 346                  * linked to some other node.  We ignore it, since the
 347                  * rest of the array contains the non-leaves which we
 348                  * need.  (Note that we're assuming the cases with 0 or 1
 349                  * symbols were handled separately.) */
 350 }
 351
 352 /*
 353  * Given the stripped-down Huffman tree constructed by build_tree(),
 354  * determine the number of codewords that should be assigned each
 355  * possible length, taking into account the length-limited constraint.
 356  *
 357  * @A
 358  *      The array produced by build_tree(), containing parent index
 359  *      information for the non-leaf nodes of the Huffman tree.  Each
 360  *      entry in this array is a node; a node's parent always has a
 361  *      greater index than that node itself.  This function will
 362  *      overwrite the parent index information in this array, so
 363  *      essentially it will destroy the tree.  However, the data in the
 364  *      low NUM_SYMBOL_BITS of each entry will be preserved.
 365  *
 366  * @root_idx
 367  *      The 0-based index of the root node in 'A', and consequently one
 368  *      less than the number of tree node entries in 'A'.  (Or, really 2
 369  *      less than the actual length of 'A'.)
 370  *
 371  * @len_counts
 372  *      An array of length ('max_codeword_len' + 1) in which the number of
 373  *      codewords having each length <= max_codeword_len will be
 374  *      returned.
 375  *
 376  * @max_codeword_len
 377  *      The maximum permissible codeword length.
 378  */
 379 static void
 380 compute_length_counts(u32 A[restrict], unsigned root_idx,
 381                       unsigned len_counts[restrict], unsigned max_codeword_len)
 382 {
 383         /* The key observations are:
 384          *
 385          * (1) We can traverse the non-leaf nodes of the tree, always
 386          * visiting a parent before its children, by simply iterating
 387          * through the array in reverse order.  Consequently, we can
 388          * compute the depth of each node in one pass, overwriting the
 389          * parent indices with depths.
 390          *
 391          * (2) We can initially assume that in the real Huffman tree,
 392          * both children of the root are leaves.  This corresponds to two
 393          * codewords of length 1.  Then, whenever we visit a (non-leaf)
 394          * node during the traversal, we modify this assumption to
 395          * account for the current node *not* being a leaf, but rather
 396          * its two children being leaves.  This causes the loss of one
 397          * codeword for the current depth and the addition of two
 398          * codewords for the current depth plus one.
 399          *
 400          * (3) We can handle the length-limited constraint fairly easily
 401          * by simply using the largest length available when a depth
 402          * exceeds max_codeword_len.
 403          */
 404
 405         for (unsigned len = 0; len <= max_codeword_len; len++)
 406                 len_counts[len] = 0;
 407         len_counts[1] = 2;
 408
 409         /* Set the root node's depth to 0.  */
 410         A[root_idx] &= SYMBOL_MASK;
 411
 412         for (int node = root_idx - 1; node >= 0; node--) {
 413
 414                 /* Calculate the depth of this node.  */
 415
 416                 unsigned parent = A[node] >> NUM_SYMBOL_BITS;
 417                 unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS;
 418                 unsigned depth = parent_depth + 1;
 419                 unsigned len = depth;
 420
 421                 /* Set the depth of this node so that it is available
 422                  * when its children (if any) are processed.  */
 423
 424                 A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS);
 425
 426                 /* If needed, decrease the length to meet the
 427                  * length-limited constraint.  This is not the optimal
 428                  * method for generating length-limited Huffman codes!
 429                  * But it should be good enough.  */
 430                 if (len >= max_codeword_len) {
 431                         len = max_codeword_len;
 432                         do {
 433                                 len--;
 434                         } while (len_counts[len] == 0);
 435                 }
 436
 437                 /* Account for the fact that we have a non-leaf node at
 438                  * the current depth.  */
 439                 len_counts[len]--;
 440                 len_counts[len + 1] += 2;
 441         }
 442 }
 443
 444 /*
 445  * Generate the codewords for a canonical Huffman code.
 446  *
 447  * @A
 448  *      The output array for codewords.  In addition, initially this
 449  *      array must contain the symbols, sorted primarily by frequency and
 450  *      secondarily by symbol value, in the low NUM_SYMBOL_BITS bits of
 451  *      each entry.
 452  *
 453  * @len
 454  *      Output array for codeword lengths.
 455  *
 456  * @len_counts
 457  *      An array that provides the number of codewords that will have
 458  *      each possible length <= max_codeword_len.
 459  *
 460  * @max_codeword_len
 461  *      Maximum length, in bits, of each codeword.
 462  *
 463  * @num_syms
 464  *      Number of symbols in the alphabet, including symbols with zero
 465  *      frequency.  This is the length of the 'A' and 'len' arrays.
 466  */
 467 static void
 468 gen_codewords(u32 A[restrict], u8 lens[restrict],
 469               const unsigned len_counts[restrict],
 470               unsigned max_codeword_len, unsigned num_syms)
 471 {
 472         u32 next_codewords[max_codeword_len + 1];
 473
 474         /* Given the number of codewords that will have each length,
 475          * assign codeword lengths to symbols.  We do this by assigning
 476          * the lengths in decreasing order to the symbols sorted
 477          * primarily by increasing frequency and secondarily by
 478          * increasing symbol value.  */
 479         for (unsigned i = 0, len = max_codeword_len; len >= 1; len--) {
 480                 unsigned count = len_counts[len];
 481                 while (count--)
 482                         lens[A[i++] & SYMBOL_MASK] = len;
 483         }
 484
 485         /* Generate the codewords themselves.  We initialize the
 486          * 'next_codewords' array to provide the lexicographically first
 487          * codeword of each length, then assign codewords in symbol
 488          * order.  This produces a canonical code.  */
 489         next_codewords[0] = 0;
 490         next_codewords[1] = 0;
 491         for (unsigned len = 2; len <= max_codeword_len; len++)
 492                 next_codewords[len] =
 493                         (next_codewords[len - 1] + len_counts[len - 1]) << 1;
 494
 495         for (unsigned sym = 0; sym < num_syms; sym++)
 496                 A[sym] = next_codewords[lens[sym]]++;
 497 }
 498
 499 /*
 500  * ---------------------------------------------------------------------
 501  *                      make_canonical_huffman_code()
 502  * ---------------------------------------------------------------------
 503  *
 504  * Given an alphabet and the frequency of each symbol in it, construct a
 505  * length-limited canonical Huffman code.
 506  *
 507  * @num_syms
 508  *      The number of symbols in the alphabet.  The symbols are the
 509  *      integers in the range [0, num_syms - 1].  This parameter must be
 510  *      at least 2 and can't be greater than (1 << NUM_SYMBOL_BITS).
 511  *
 512  * @max_codeword_len
 513  *      The maximum permissible codeword length.
 514  *
 515  * @freqs
 516  *      An array of @num_syms entries, each of which specifies the
 517  *      frequency of the corresponding symbol.  It is valid for some,
 518  *      none, or all of the frequencies to be 0.
 519  *
 520  * @lens
 521  *      An array of @num_syms entries in which this function will return
 522  *      the length, in bits, of the codeword assigned to each symbol.
 523  *      Symbols with 0 frequency will not have codewords per se, but
 524  *      their entries in this array will be set to 0.  No lengths greater
 525  *      than @max_codeword_len will be assigned.
 526  *
 527  * @codewords
 528  *      An array of @num_syms entries in which this function will return
 529  *      the codeword for each symbol, right-justified and padded on the
 530  *      left with zeroes.  Codewords for symbols with 0 frequency will be
 531  *      undefined.
 532  *
 533  * ---------------------------------------------------------------------
 534  *
 535  * This function builds a length-limited canonical Huffman code.
 536  *
 537  * A length-limited Huffman code contains no codewords longer than some
 538  * specified length, and has exactly (with some algorithms) or
 539  * approximately (with the algorithm used here) the minimum weighted path
 540  * length from the root, given this constraint.
 541  *
 542  * A canonical Huffman code satisfies the properties that a longer
 543  * codeword never lexicographically precedes a shorter codeword, and the
 544  * lexicographic ordering of codewords of the same length is the same as
 545  * the lexicographic ordering of the corresponding symbols.  A canonical
 546  * Huffman code, or more generally a canonical prefix code, can be
 547  * reconstructed from only a list containing the codeword length of each
 548  * symbol.
 549  *
 550  * The classic algorithm to generate a Huffman code creates a node for
 551  * each symbol, then inserts these nodes into a min-heap keyed by symbol
 552  * frequency.  Then, repeatedly, the two lowest-frequency nodes are
 553  * removed from the min-heap and added as the children of a new node
 554  * having frequency equal to the sum of its two children, which is then
 555  * inserted into the min-heap.  When only a single node remains in the
 556  * min-heap, it is the root of the Huffman tree.  The codeword for each
 557  * symbol is determined by the path needed to reach the corresponding
 558  * node from the root.  Descending to the left child appends a 0 bit,
 559  * whereas descending to the right child appends a 1 bit.
 560  *
 561  * The classic algorithm is relatively easy to understand, but it is
 562  * subject to a number of inefficiencies.  In practice, it is fastest to
 563  * first sort the symbols by frequency.  (This itself can be subject to
 564  * an optimization based on the fact that most frequencies tend to be
 565  * low.)  At the same time, we sort secondarily by symbol value, which
 566  * aids the process of generating a canonical code.  Then, during tree
 567  * construction, no heap is necessary because both the leaf nodes and the
 568  * unparented non-leaf nodes can be easily maintained in sorted order.
 569  * Consequently, there can never be more than two possibilities for the
 570  * next-lowest-frequency node.
 571  *
 572  * In addition, because we're generating a canonical code, we actually
 573  * don't need the leaf nodes of the tree at all, only the non-leaf nodes.
 574  * This is because for canonical code generation we don't need to know
 575  * where the symbols are in the tree.  Rather, we only need to know how
 576  * many leaf nodes have each depth (codeword length).  And this
 577  * information can, in fact, be quickly generated from the tree of
 578  * non-leaves only.
 579  *
 580  * Furthermore, we can build this stripped-down Huffman tree directly in
 581  * the array in which the codewords are to be generated, provided that
 582  * these array slots are large enough to hold a symbol and frequency
 583  * value.
 584  *
 585  * Still furthermore, we don't even need to maintain explicit child
 586  * pointers.  We only need the parent pointers, and even those can be
 587  * overwritten in-place with depth information as part of the process of
 588  * extracting codeword lengths from the tree.  So in summary, we do NOT
 589  * need a big structure like:
 590  *
 591  *      struct huffman_tree_node {
 592  *              unsigned int symbol;
 593  *              unsigned int frequency;
 594  *              unsigned int depth;
 595  *              struct huffman_tree_node *left_child;
 596  *              struct huffman_tree_node *right_child;
 597  *      };
 598  *
 599  *
 600  *   ... which often gets used in "naive" implementations of Huffman code
 601  *   generation.
 602  *
 603  * Most of these optimizations are based on the implementation in 7-Zip
 604  * (source file: C/HuffEnc.c), which has been placed in the public domain
 605  * by Igor Pavlov.  But I've rewritten the code with extensive comments,
 606  * as it took me a while to figure out what it was doing...!
 607  *
 608  * ---------------------------------------------------------------------
 609  *
 610  * NOTE: in general, the same frequencies can be used to generate
 611  * different length-limited canonical Huffman codes.  One choice we have
 612  * is during tree construction, when we must decide whether to prefer a
 613  * leaf or non-leaf when there is a tie in frequency.  Another choice we
 614  * have is how to deal with codewords that would exceed @max_codeword_len
 615  * bits in length.  Both of these choices affect the resulting codeword
 616  * lengths, which otherwise can be mapped uniquely onto the resulting
 617  * canonical Huffman code.
 618  *
 619  * Normally, there is no problem with choosing one valid code over
 620  * another, provided that they produce similar compression ratios.
 621  * However, the LZMS compression format uses adaptive Huffman coding.  It
 622  * requires that both the decompressor and compressor build a canonical
 623  * code equivalent to that which can be generated by using the classic
 624  * Huffman tree construction algorithm and always processing leaves
 625  * before non-leaves when there is a frequency tie.  Therefore, we make
 626  * sure to do this.  This method also has the advantage of sometimes
 627  * shortening the longest codeword that is generated.
 628  *
 629  * There also is the issue of how codewords longer than @max_codeword_len
 630  * are dealt with.  Fortunately, for LZMS this is irrelevant because
 631  * because for the LZMS alphabets no codeword can ever exceed
 632  * LZMS_MAX_CODEWORD_LEN (= 15).  Since the LZMS algorithm regularly
 633  * halves all frequencies, the frequencies cannot become high enough for
 634  * a length 16 codeword to be generated.  Specifically, I think that if
 635  * ties are broken in favor of non-leaves (as we do), the lowest total
 636  * frequency that would give a length-16 codeword would be the sum of the
 637  * frequencies 1 1 1 3 4 7 11 18 29 47 76 123 199 322 521 843 1364, which
 638  * is 3570.  And in LZMS we can't get a frequency that high based on the
 639  * alphabet sizes, rebuild frequencies, and scaling factors.  This
 640  * worst-case scenario is based on the following degenerate case (only
 641  * the bottom of the tree shown):
 642  *
 643  *                          ...
 644  *                        17
 645  *                       /  \
 646  *                      10   7
 647  *                     / \
 648  *                    6   4
 649  *                   / \
 650  *                  3   3
 651  *                 / \
 652  *                2   1
 653  *               / \
 654  *              1   1
 655  *
 656  * Excluding the first leaves (those with value 1), each leaf value must
 657  * be greater than the non-leaf up 1 and down 2 from it; otherwise that
 658  * leaf would have taken precedence over that non-leaf and been combined
 659  * with the leaf below, thereby decreasing the height compared to that
 660  * shown.
 661  *
 662  * Interesting fact: if we were to instead prioritize non-leaves over
 663  * leaves, then the worst case frequencies would be the Fibonacci
 664  * sequence, plus an extra frequency of 1.  In this hypothetical
 665  * scenario, it would be slightly easier for longer codewords to be
 666  * generated.
 667  */
 668 void
 669 make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len,
 670                             const u32 freqs[restrict],
 671                             u8 lens[restrict], u32 codewords[restrict])
 672 {
 673         u32 *A = codewords;
 674         unsigned num_used_syms;
 675
 676         /* Assumptions  */
 677         wimlib_assert2(num_syms >= 2);
 678         wimlib_assert2(num_syms <= (1 << NUM_SYMBOL_BITS));
 679         wimlib_assert2((1ULL << max_codeword_len) >= num_syms);
 680         wimlib_assert2(max_codeword_len <= 32);
 681
 682         /* We begin by sorting the symbols primarily by frequency and
 683          * secondarily by symbol value.  As an optimization, the array
 684          * used for this purpose ('A') shares storage with the space in
 685          * which we will eventually return the codewords.  */
 686
 687         num_used_syms = sort_symbols(num_syms, freqs, lens, A);
 688
 689         /* 'num_used_syms' is the number of symbols with nonzero
 690          * frequency.  This may be less than @num_syms.  'num_used_syms'
 691          * is also the number of entries in 'A' that are valid.  Each
 692          * entry consists of a distinct symbol and a nonzero frequency
 693          * packed into a 32-bit integer.  */
 694
 695         /* Handle special cases where only 0 or 1 symbols were used (had
 696          * nonzero frequency).  */
 697
 698         if (unlikely(num_used_syms == 0)) {
 699                 /* Code is empty.  sort_symbols() already set all lengths
 700                  * to 0, so there is nothing more to do.  */
 701                 return;
 702         }
 703
 704         if (unlikely(num_used_syms == 1)) {
 705                 /* Only one symbol was used, so we only need one
 706                  * codeword.  But two codewords are needed to form the
 707                  * smallest complete Huffman code, which uses codewords 0
 708                  * and 1.  Therefore, we choose another symbol to which
 709                  * to assign a codeword.  We use 0 (if the used symbol is
 710                  * not 0) or 1 (if the used symbol is 0).  In either
 711                  * case, the lesser-valued symbol must be assigned
 712                  * codeword 0 so that the resulting code is canonical.  */
 713
 714                 unsigned sym = A[0] & SYMBOL_MASK;
 715                 unsigned nonzero_idx = sym ? sym : 1;
 716
 717                 codewords[0] = 0;
 718                 lens[0] = 1;
 719                 codewords[nonzero_idx] = 1;
 720                 lens[nonzero_idx] = 1;
 721                 return;
 722         }
 723
 724         /* Build a stripped-down version of the Huffman tree, sharing the
 725          * array 'A' with the symbol values.  Then extract length counts
 726          * from the tree and use them to generate the final codewords.  */
 727
 728         build_tree(A, num_used_syms);
 729
 730         {
 731                 unsigned len_counts[max_codeword_len + 1];
 732
 733                 compute_length_counts(A, num_used_syms - 2,
 734                                       len_counts, max_codeword_len);
 735
 736                 gen_codewords(A, lens, len_counts, max_codeword_len, num_syms);
 737         }
 738 }