4 * Code for compression shared among multiple compression formats.
6 * Copyright 2022 Eric Biggers
8 * Permission is hereby granted, free of charge, to any person
9 * obtaining a copy of this software and associated documentation
10 * files (the "Software"), to deal in the Software without
11 * restriction, including without limitation the rights to use,
12 * copy, modify, merge, publish, distribute, sublicense, and/or sell
13 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
17 * The above copyright notice and this permission notice shall be
18 * included in all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27 * OTHER DEALINGS IN THE SOFTWARE.
36 #include "wimlib/assert.h"
37 #include "wimlib/compress_common.h"
38 #include "wimlib/util.h"
41 * Given the binary tree node A[subtree_idx] whose children already satisfy the
42 * maxheap property, swap the node with its greater child until it is greater
43 * than or equal to both of its children, so that the maxheap property is
44 * satisfied in the subtree rooted at A[subtree_idx]. 'A' uses 1-based indices.
static void
heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx)
{
	unsigned parent_idx;
	unsigned child_idx;
	u32 v;

	/* Sift the value at the subtree root down until the maxheap property
	 * holds.  Both child subtrees are assumed to already be maxheaps. */
	v = A[subtree_idx];
	parent_idx = subtree_idx;
	while ((child_idx = parent_idx * 2) <= length) {
		/* Choose the greater of the (up to two) children. */
		if (child_idx < length && A[child_idx + 1] > A[child_idx])
			child_idx++;
		/* Stop once the sifted value dominates both children. */
		if (v >= A[child_idx])
			break;
		/* Move the greater child up; keep sifting down. */
		A[parent_idx] = A[child_idx];
		parent_idx = child_idx;
	}
	/* Place the original root value at its final position. */
	A[parent_idx] = v;
}
67 * Rearrange the array 'A' so that it satisfies the maxheap property.
68 * 'A' uses 1-based indices, so the children of A[i] are A[i*2] and A[i*2 + 1].
static void
heapify_array(u32 A[], unsigned length)
{
	unsigned subtree_idx;

	/*
	 * Process subtree roots in reverse order so that each node's children
	 * already satisfy the maxheap property when the node itself is
	 * heapified.  Nodes past length/2 are leaves, which are trivially
	 * valid maxheaps on their own, so they can be skipped.
	 */
	for (subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--)
		heapify_subtree(A, length, subtree_idx);
}
80 * Sort the array 'A', which contains 'length' unsigned 32-bit integers.
82 * Note: name this function heap_sort() instead of heapsort() to avoid colliding
83 * with heapsort() from stdlib.h on BSD-derived systems --- though this isn't
84 * necessary when compiling with -D_ANSI_SOURCE, which is the better solution.
static void
heap_sort(u32 A[], unsigned length)
{
	A--; /* Use 1-based indices */

	heapify_array(A, length);

	/*
	 * Standard heapsort extraction: repeatedly swap the maximum (the heap
	 * root A[1]) with the last element of the shrinking heap, then restore
	 * the maxheap property on the reduced heap.  The sorted suffix grows
	 * from the end of the array toward the front.
	 */
	while (length >= 2) {
		u32 tmp = A[length];

		A[length] = A[1];
		A[1] = tmp;
		length--;
		heapify_subtree(A, length, 1);
	}
}
103 #define NUM_SYMBOL_BITS 10
104 #define NUM_FREQ_BITS (32 - NUM_SYMBOL_BITS)
105 #define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1)
106 #define FREQ_MASK (~SYMBOL_MASK)
108 #define GET_NUM_COUNTERS(num_syms) (num_syms)
111 * Sort the symbols primarily by frequency and secondarily by symbol value.
112 * Discard symbols with zero frequency and fill in an array with the remaining
113 * symbols, along with their frequencies. The low NUM_SYMBOL_BITS bits of each
114 * array entry will contain the symbol value, and the remaining bits will
115 * contain the frequency.
118 * Number of symbols in the alphabet, at most 1 << NUM_SYMBOL_BITS.
121 * Frequency of each symbol, summing to at most (1 << NUM_FREQ_BITS) - 1.
124 * An array that eventually will hold the length of each codeword. This
125 * function only fills in the codeword lengths for symbols that have zero
126 * frequency, which are not well defined per se but will be set to 0.
129 * The output array, described above.
131 * Returns the number of entries in 'symout' that were filled. This is the
132 * number of symbols that have nonzero frequency.
135 sort_symbols(unsigned num_syms, const u32 freqs[], u8 lens[], u32 symout[])
139 unsigned num_used_syms;
140 unsigned num_counters;
141 unsigned counters[GET_NUM_COUNTERS(MAX_NUM_SYMS)];
144 * We use heapsort, but with an added optimization. Since often most
145 * symbol frequencies are low, we first do a count sort using a limited
146 * number of counters. High frequencies are counted in the last
147 * counter, and only they will be sorted with heapsort.
149 * Note: with more symbols, it is generally beneficial to have more
150 * counters. About 1 counter per symbol seems fastest.
153 num_counters = GET_NUM_COUNTERS(num_syms);
155 memset(counters, 0, num_counters * sizeof(counters[0]));
157 /* Count the frequencies. */
158 for (sym = 0; sym < num_syms; sym++)
159 counters[MIN(freqs[sym], num_counters - 1)]++;
162 * Make the counters cumulative, ignoring the zero-th, which counted
163 * symbols with zero frequency. As a side effect, this calculates the
164 * number of symbols with nonzero frequency.
167 for (i = 1; i < num_counters; i++) {
168 unsigned count = counters[i];
170 counters[i] = num_used_syms;
171 num_used_syms += count;
175 * Sort nonzero-frequency symbols using the counters. At the same time,
176 * set the codeword lengths of zero-frequency symbols to 0.
178 for (sym = 0; sym < num_syms; sym++) {
179 u32 freq = freqs[sym];
182 symout[counters[MIN(freq, num_counters - 1)]++] =
183 sym | (freq << NUM_SYMBOL_BITS);
189 /* Sort the symbols counted in the last counter. */
190 heap_sort(symout + counters[num_counters - 2],
191 counters[num_counters - 1] - counters[num_counters - 2]);
193 return num_used_syms;
197 * Build a Huffman tree.
199 * This is an optimized implementation that
200 * (a) takes advantage of the frequencies being already sorted;
201 * (b) only generates non-leaf nodes, since the non-leaf nodes of a Huffman
202 * tree are sufficient to generate a canonical code;
203 * (c) Only stores parent pointers, not child pointers;
204 * (d) Produces the nodes in the same memory used for input frequency
207 * Array 'A', which contains 'sym_count' entries, is used for both input and
208 * output. For this function, 'sym_count' must be at least 2.
210 * For input, the array must contain the frequencies of the symbols, sorted in
211 * increasing order. Specifically, each entry must contain a frequency left
212 * shifted by NUM_SYMBOL_BITS bits. Any data in the low NUM_SYMBOL_BITS bits of
213 * the entries will be ignored by this function. Although these bits will, in
214 * fact, contain the symbols that correspond to the frequencies, this function
215 * is concerned with frequencies only and keeps the symbols as-is.
217 * For output, this function will produce the non-leaf nodes of the Huffman
218 * tree. These nodes will be stored in the first (sym_count - 1) entries of the
219 * array. Entry A[sym_count - 2] will represent the root node. Each other node
220 * will contain the zero-based index of its parent node in 'A', left shifted by
221 * NUM_SYMBOL_BITS bits. The low NUM_SYMBOL_BITS bits of each entry in A will
222 * be kept as-is. Again, note that although these low bits will, in fact,
223 * contain a symbol value, this symbol will have *no relationship* with the
224 * Huffman tree node that happens to occupy the same slot. This is because this
225 * implementation only generates the non-leaf nodes of the tree.
228 build_tree(u32 A[], unsigned sym_count)
230 const unsigned last_idx = sym_count - 1;
232 /* Index of the next lowest frequency leaf that still needs a parent */
236 * Index of the next lowest frequency non-leaf that still needs a
237 * parent, or 'e' if there is currently no such node
241 /* Index of the next spot for a non-leaf (will overwrite a leaf) */
248 * Select the next two lowest frequency nodes among the leaves
249 * A[i] and non-leaves A[b], and create a new node A[e] to be
250 * their parent. Set the new node's frequency to the sum of the
251 * frequencies of its two children.
253 * Usually the next two lowest frequency nodes are of the same
254 * type (leaf or non-leaf), so check those cases first.
256 if (i + 1 <= last_idx &&
257 (b == e || (A[i + 1] & FREQ_MASK) <= (A[b] & FREQ_MASK))) {
259 new_freq = (A[i] & FREQ_MASK) + (A[i + 1] & FREQ_MASK);
261 } else if (b + 2 <= e &&
263 (A[b + 1] & FREQ_MASK) < (A[i] & FREQ_MASK))) {
265 new_freq = (A[b] & FREQ_MASK) + (A[b + 1] & FREQ_MASK);
266 A[b] = (e << NUM_SYMBOL_BITS) | (A[b] & SYMBOL_MASK);
267 A[b + 1] = (e << NUM_SYMBOL_BITS) |
268 (A[b + 1] & SYMBOL_MASK);
271 /* One leaf and one non-leaf */
272 new_freq = (A[i] & FREQ_MASK) + (A[b] & FREQ_MASK);
273 A[b] = (e << NUM_SYMBOL_BITS) | (A[b] & SYMBOL_MASK);
277 A[e] = new_freq | (A[e] & SYMBOL_MASK);
279 * A binary tree with 'n' leaves has 'n - 1' non-leaves, so the
280 * tree is complete once we've created 'n - 1' non-leaves.
282 } while (++e < last_idx);
286 * Given the stripped-down Huffman tree constructed by build_tree(), determine
287 * the number of codewords that should be assigned each possible length, taking
288 * into account the length-limited constraint.
291 * The array produced by build_tree(), containing parent index information
292 * for the non-leaf nodes of the Huffman tree. Each entry in this array is
293 * a node; a node's parent always has a greater index than that node
294 * itself. This function will overwrite the parent index information in
295 * this array, so essentially it will destroy the tree. However, the data
296 * in the low NUM_SYMBOL_BITS of each entry will be preserved.
299 * The 0-based index of the root node in 'A', and consequently one less
300 * than the number of tree node entries in 'A'. (Or, really 2 less than
301 * the actual length of 'A'.)
304 * An array of length ('max_codeword_len' + 1) in which the number of
305 * codewords having each length <= max_codeword_len will be returned.
308 * The maximum permissible codeword length.
311 compute_length_counts(u32 A[], unsigned root_idx, unsigned len_counts[],
312 unsigned max_codeword_len)
318 * The key observations are:
320 * (1) We can traverse the non-leaf nodes of the tree, always visiting a
321 * parent before its children, by simply iterating through the array
322 * in reverse order. Consequently, we can compute the depth of each
323 * node in one pass, overwriting the parent indices with depths.
325 * (2) We can initially assume that in the real Huffman tree, both
326 * children of the root are leaves. This corresponds to two
327 * codewords of length 1. Then, whenever we visit a (non-leaf) node
328 * during the traversal, we modify this assumption to account for
329 * the current node *not* being a leaf, but rather its two children
330 * being leaves. This causes the loss of one codeword for the
331 * current depth and the addition of two codewords for the current
334 * (3) We can handle the length-limited constraint fairly easily by
335 * simply using the largest length available when a depth exceeds
339 for (len = 0; len <= max_codeword_len; len++)
343 /* Set the root node's depth to 0. */
344 A[root_idx] &= SYMBOL_MASK;
346 for (node = root_idx - 1; node >= 0; node--) {
348 /* Calculate the depth of this node. */
350 unsigned parent = A[node] >> NUM_SYMBOL_BITS;
351 unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS;
352 unsigned depth = parent_depth + 1;
353 unsigned len = depth;
356 * Set the depth of this node so that it is available when its
357 * children (if any) are processed.
359 A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS);
362 * If needed, decrease the length to meet the length-limited
363 * constraint. This is not the optimal method for generating
364 * length-limited Huffman codes! But it should be good enough.
366 if (len >= max_codeword_len) {
367 len = max_codeword_len;
370 } while (len_counts[len] == 0);
374 * Account for the fact that we have a non-leaf node at the
378 len_counts[len + 1] += 2;
383 * Generate the codewords for a canonical Huffman code.
386 * The output array for codewords. In addition, initially this
387 * array must contain the symbols, sorted primarily by frequency and
388 * secondarily by symbol value, in the low NUM_SYMBOL_BITS bits of
392 * Output array for codeword lengths.
395 * An array that provides the number of codewords that will have
396 * each possible length <= max_codeword_len.
399 * Maximum length, in bits, of each codeword.
402 * Number of symbols in the alphabet, including symbols with zero
403 * frequency. This is the length of the 'A' and 'len' arrays.
406 gen_codewords(u32 A[], u8 lens[], const unsigned len_counts[],
407 unsigned max_codeword_len, unsigned num_syms)
409 u32 next_codewords[MAX_CODEWORD_LEN + 1];
415 * Given the number of codewords that will have each length, assign
416 * codeword lengths to symbols. We do this by assigning the lengths in
417 * decreasing order to the symbols sorted primarily by increasing
418 * frequency and secondarily by increasing symbol value.
420 for (i = 0, len = max_codeword_len; len >= 1; len--) {
421 unsigned count = len_counts[len];
424 lens[A[i++] & SYMBOL_MASK] = len;
428 * Generate the codewords themselves. We initialize the
429 * 'next_codewords' array to provide the lexicographically first
430 * codeword of each length, then assign codewords in symbol order. This
431 * produces a canonical code.
433 next_codewords[0] = 0;
434 next_codewords[1] = 0;
435 for (len = 2; len <= max_codeword_len; len++)
436 next_codewords[len] =
437 (next_codewords[len - 1] + len_counts[len - 1]) << 1;
439 for (sym = 0; sym < num_syms; sym++)
440 A[sym] = next_codewords[lens[sym]]++;
444 * ---------------------------------------------------------------------
445 * make_canonical_huffman_code()
446 * ---------------------------------------------------------------------
448 * Given an alphabet and the frequency of each symbol in it, construct a
449 * length-limited canonical Huffman code.
452 * The number of symbols in the alphabet. The symbols are the integers in
453 * the range [0, num_syms - 1]. This parameter must be at least 2 and
454 * must not exceed (1 << NUM_SYMBOL_BITS).
457 * The maximum permissible codeword length.
460 * An array of length @num_syms that gives the frequency of each symbol.
461 * It is valid for some, none, or all of the frequencies to be 0. The sum
462 * of frequencies must not exceed (1 << NUM_FREQ_BITS) - 1.
465 * An array of @num_syms entries in which this function will return the
466 * length, in bits, of the codeword assigned to each symbol. Symbols with
467 * 0 frequency will not have codewords per se, but their entries in this
468 * array will be set to 0. No lengths greater than @max_codeword_len will
472 * An array of @num_syms entries in which this function will return the
473 * codeword for each symbol, right-justified and padded on the left with
474 * zeroes. Codewords for symbols with 0 frequency will be undefined.
476 * ---------------------------------------------------------------------
478 * This function builds a length-limited canonical Huffman code.
480 * A length-limited Huffman code contains no codewords longer than some
481 * specified length, and has exactly (with some algorithms) or approximately
482 * (with the algorithm used here) the minimum weighted path length from the
483 * root, given this constraint.
485 * A canonical Huffman code satisfies the properties that a longer codeword
486 * never lexicographically precedes a shorter codeword, and the lexicographic
487 * ordering of codewords of the same length is the same as the lexicographic
488 * ordering of the corresponding symbols. A canonical Huffman code, or more
489 * generally a canonical prefix code, can be reconstructed from only a list
490 * containing the codeword length of each symbol.
492 * The classic algorithm to generate a Huffman code creates a node for each
493 * symbol, then inserts these nodes into a min-heap keyed by symbol frequency.
494 * Then, repeatedly, the two lowest-frequency nodes are removed from the
495 * min-heap and added as the children of a new node having frequency equal to
496 * the sum of its two children, which is then inserted into the min-heap. When
497 * only a single node remains in the min-heap, it is the root of the Huffman
498 * tree. The codeword for each symbol is determined by the path needed to reach
499 * the corresponding node from the root. Descending to the left child appends a
500 * 0 bit, whereas descending to the right child appends a 1 bit.
502 * The classic algorithm is relatively easy to understand, but it is subject to
503 * a number of inefficiencies. In practice, it is fastest to first sort the
504 * symbols by frequency. (This itself can be subject to an optimization based
505 * on the fact that most frequencies tend to be low.) At the same time, we sort
506 * secondarily by symbol value, which aids the process of generating a canonical
507 * code. Then, during tree construction, no heap is necessary because both the
508 * leaf nodes and the unparented non-leaf nodes can be easily maintained in
509 * sorted order. Consequently, there can never be more than two possibilities
510 * for the next-lowest-frequency node.
512 * In addition, because we're generating a canonical code, we actually don't
513 * need the leaf nodes of the tree at all, only the non-leaf nodes. This is
514 * because for canonical code generation we don't need to know where the symbols
515 * are in the tree. Rather, we only need to know how many leaf nodes have each
516 * depth (codeword length). And this information can, in fact, be quickly
517 * generated from the tree of non-leaves only.
519 * Furthermore, we can build this stripped-down Huffman tree directly in the
520 * array in which the codewords are to be generated, provided that these array
521 * slots are large enough to hold a symbol and frequency value.
523 * Still furthermore, we don't even need to maintain explicit child pointers.
524 * We only need the parent pointers, and even those can be overwritten in-place
525 * with depth information as part of the process of extracting codeword lengths
526 * from the tree. So in summary, we do NOT need a big structure like:
528 * struct huffman_tree_node {
529 * unsigned int symbol;
530 * unsigned int frequency;
531 * unsigned int depth;
532 * struct huffman_tree_node *left_child;
533 * struct huffman_tree_node *right_child;
537 * ... which often gets used in "naive" implementations of Huffman code
540 * Many of these optimizations are based on the implementation in 7-Zip (source
541 * file: C/HuffEnc.c), which was placed in the public domain by Igor Pavlov.
543 * NOTE: in general, the same frequencies can be used to generate different
544 * length-limited canonical Huffman codes. One choice we have is during tree
545 * construction, when we must decide whether to prefer a leaf or non-leaf when
546 * there is a tie in frequency. Another choice we have is how to deal with
547 * codewords that would exceed @max_codeword_len bits in length. Both of these
548 * choices affect the resulting codeword lengths, which otherwise can be mapped
549 * uniquely onto the resulting canonical Huffman code.
551 * Normally, there is no problem with choosing one valid code over another,
552 * provided that they produce similar compression ratios. However, the LZMS
553 * compression format uses adaptive Huffman coding. It requires that both the
554 * decompressor and compressor build a canonical code equivalent to that which
555 * can be generated by using the classic Huffman tree construction algorithm and
556 * always processing leaves before non-leaves when there is a frequency tie.
557 * Therefore, we make sure to do this. This method also has the advantage of
558 * sometimes shortening the longest codeword that is generated.
560 * There also is the issue of how codewords longer than @max_codeword_len are
561 * dealt with. Fortunately, for LZMS this is irrelevant because for the LZMS
562 * alphabets no codeword can ever exceed LZMS_MAX_CODEWORD_LEN (= 15). Since
563 * the LZMS algorithm regularly halves all frequencies, the frequencies cannot
564 * become high enough for a length 16 codeword to be generated. Specifically, I
565 * think that if ties are broken in favor of non-leaves (as we do), the lowest
566 * total frequency that would give a length-16 codeword would be the sum of the
567 * frequencies 1 1 1 3 4 7 11 18 29 47 76 123 199 322 521 843 1364, which is
568 * 3570. And in LZMS we can't get a frequency that high based on the alphabet
569 * sizes, rebuild frequencies, and scaling factors. This worst-case scenario is
570 * based on the following degenerate case (only the bottom of the tree shown):
585 * Excluding the first leaves (those with value 1), each leaf value must be
586 * greater than the non-leaf up 1 and down 2 from it; otherwise that leaf would
587 * have taken precedence over that non-leaf and been combined with the leaf
588 * below, thereby decreasing the height compared to that shown.
590 * Interesting fact: if we were to instead prioritize non-leaves over leaves,
591 * then the worst case frequencies would be the Fibonacci sequence, plus an
592 * extra frequency of 1. In this hypothetical scenario, it would be slightly
593 * easier for longer codewords to be generated.
596 make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len,
597 const u32 freqs[], u8 lens[], u32 codewords[])
600 unsigned num_used_syms;
602 wimlib_assert(num_syms <= MAX_NUM_SYMS);
603 STATIC_ASSERT(MAX_NUM_SYMS <= 1 << NUM_SYMBOL_BITS);
604 wimlib_assert(max_codeword_len <= MAX_CODEWORD_LEN);
607 * We begin by sorting the symbols primarily by frequency and
608 * secondarily by symbol value. As an optimization, the array used for
609 * this purpose ('A') shares storage with the space in which we will
610 * eventually return the codewords.
612 num_used_syms = sort_symbols(num_syms, freqs, lens, A);
615 * 'num_used_syms' is the number of symbols with nonzero frequency.
616 * This may be less than @num_syms. 'num_used_syms' is also the number
617 * of entries in 'A' that are valid. Each entry consists of a distinct
618 * symbol and a nonzero frequency packed into a 32-bit integer.
622 * Handle special cases where only 0 or 1 symbols were used (had nonzero
626 if (unlikely(num_used_syms == 0)) {
628 * Code is empty. sort_symbols() already set all lengths to 0,
629 * so there is nothing more to do.
634 if (unlikely(num_used_syms == 1)) {
636 * Only one symbol was used, so we only need one codeword. But
637 * two codewords are needed to form the smallest complete
638 * Huffman code, which uses codewords 0 and 1. Therefore, we
639 * choose another symbol to which to assign a codeword. We use
640 * 0 (if the used symbol is not 0) or 1 (if the used symbol is
641 * 0). In either case, the lesser-valued symbol must be
642 * assigned codeword 0 so that the resulting code is canonical.
645 unsigned sym = A[0] & SYMBOL_MASK;
646 unsigned nonzero_idx = sym ? sym : 1;
650 codewords[nonzero_idx] = 1;
651 lens[nonzero_idx] = 1;
656 * Build a stripped-down version of the Huffman tree, sharing the array
657 * 'A' with the symbol values. Then extract length counts from the tree
658 * and use them to generate the final codewords.
661 build_tree(A, num_used_syms);
664 unsigned len_counts[MAX_CODEWORD_LEN + 1];
666 compute_length_counts(A, num_used_syms - 2,
667 len_counts, max_codeword_len);
669 gen_codewords(A, lens, len_counts, max_codeword_len, num_syms);