wimlib.net Git - wimlib/blob - src/compress_common.c

   1 /*
   2  * compress_common.c
   3  *
   4  * Code for compression shared among multiple compression formats.
   5  *
   6  * Copyright 2022 Eric Biggers
   7  *
   8  * Permission is hereby granted, free of charge, to any person
   9  * obtaining a copy of this software and associated documentation
  10  * files (the "Software"), to deal in the Software without
  11  * restriction, including without limitation the rights to use,
  12  * copy, modify, merge, publish, distribute, sublicense, and/or sell
  13  * copies of the Software, and to permit persons to whom the
  14  * Software is furnished to do so, subject to the following
  15  * conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be
  18  * included in all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  21  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
  22  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  23  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
  24  * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
  25  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  26  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  27  * OTHER DEALINGS IN THE SOFTWARE.
  28  */
  29
  30 #ifdef HAVE_CONFIG_H
  31 #  include "config.h"
  32 #endif
  33
  34 #include <string.h>
  35
  36 #include "wimlib/compress_common.h"
  37 #include "wimlib/util.h"
  38
  39 /* Given the binary tree node A[subtree_idx] whose children already
  40  * satisfy the maxheap property, swap the node with its greater child
  41  * until it is greater than both its children, so that the maxheap
  42  * property is satisfied in the subtree rooted at A[subtree_idx].  */
  43 static void
  44 heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx)
  45 {
  46         unsigned parent_idx;
  47         unsigned child_idx;
  48         u32 v;
  49
  50         v = A[subtree_idx];
  51         parent_idx = subtree_idx;
  52         while ((child_idx = parent_idx * 2) <= length) {
  53                 if (child_idx < length && A[child_idx + 1] > A[child_idx])
  54                         child_idx++;
  55                 if (v >= A[child_idx])
  56                         break;
  57                 A[parent_idx] = A[child_idx];
  58                 parent_idx = child_idx;
  59         }
  60         A[parent_idx] = v;
  61 }
  62
  63 /* Rearrange the array 'A' so that it satisfies the maxheap property.
  64  * 'A' uses 1-based indices, so the children of A[i] are A[i*2] and A[i*2 + 1].
  65  */
  66 static void
  67 heapify_array(u32 A[], unsigned length)
  68 {
  69         for (unsigned subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--)
  70                 heapify_subtree(A, length, subtree_idx);
  71 }
  72
  73 /* Sort the array 'A', which contains 'length' unsigned 32-bit integers.  */
  74 static void
  75 heapsort(u32 A[], unsigned length)
  76 {
  77         A--; /* Use 1-based indices  */
  78
  79         heapify_array(A, length);
  80
  81         while (length >= 2) {
  82                 swap(A[1], A[length]);
  83                 length--;
  84                 heapify_subtree(A, length, 1);
  85         }
  86 }
  87
  88 #define NUM_SYMBOL_BITS 10
  89 #define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1)
  90
  91 /*
  92  * Sort the symbols primarily by frequency and secondarily by symbol
  93  * value.  Discard symbols with zero frequency and fill in an array with
  94  * the remaining symbols, along with their frequencies.  The low
  95  * NUM_SYMBOL_BITS bits of each array entry will contain the symbol
  96  * value, and the remaining bits will contain the frequency.
  97  *
  98  * @num_syms
  99  *      Number of symbols in the alphabet.
 100  *      Can't be greater than (1 << NUM_SYMBOL_BITS).
 101  *
 102  * @freqs[num_syms]
 103  *      The frequency of each symbol.
 104  *
 105  * @lens[num_syms]
 106  *      An array that eventually will hold the length of each codeword.
 107  *      This function only fills in the codeword lengths for symbols that
 108  *      have zero frequency, which are not well defined per se but will
 109  *      be set to 0.
 110  *
 111  * @symout[num_syms]
 112  *      The output array, described above.
 113  *
 114  * Returns the number of entries in 'symout' that were filled.  This is
 115  * the number of symbols that have nonzero frequency.
 116  */
 117 static unsigned
 118 sort_symbols(unsigned num_syms, const u32 freqs[restrict],
 119              u8 lens[restrict], u32 symout[restrict])
 120 {
 121         unsigned num_used_syms;
 122         unsigned num_counters;
 123
 124         /* We rely on heapsort, but with an added optimization.  Since
 125          * it's common for most symbol frequencies to be low, we first do
 126          * a count sort using a limited number of counters.  High
 127          * frequencies will be counted in the last counter, and only they
 128          * will be sorted with heapsort.
 129          *
 130          * Note: with more symbols, it is generally beneficial to have more
 131          * counters.  About 1 counter per 4 symbols seems fast.
 132          *
 133          * Note: I also tested radix sort, but even for large symbol
 134          * counts (> 255) and frequencies bounded at 16 bits (enabling
 135          * radix sort by just two base-256 digits), it didn't seem any
 136          * faster than the method implemented here.
 137          *
 138          * Note: I tested the optimized quicksort implementation from
 139          * glibc (with indirection overhead removed), but it was only
 140          * marginally faster than the simple heapsort implemented here.
 141          *
 142          * Tests were done with building the codes for LZX.  Results may
 143          * vary for different compression algorithms...!  */
 144
 145         num_counters = ALIGN(DIV_ROUND_UP(num_syms, 4), 4);
 146
 147         unsigned counters[num_counters];
 148
 149         memset(counters, 0, sizeof(counters));
 150
 151         /* Count the frequencies.  */
 152         for (unsigned sym = 0; sym < num_syms; sym++)
 153                 counters[min(freqs[sym], num_counters - 1)]++;
 154
 155         /* Make the counters cumulative, ignoring the zero-th, which
 156          * counted symbols with zero frequency.  As a side effect, this
 157          * calculates the number of symbols with nonzero frequency.  */
 158         num_used_syms = 0;
 159         for (unsigned i = 1; i < num_counters; i++) {
 160                 unsigned count = counters[i];
 161                 counters[i] = num_used_syms;
 162                 num_used_syms += count;
 163         }
 164
 165         /* Sort nonzero-frequency symbols using the counters.  At the
 166          * same time, set the codeword lengths of zero-frequency symbols
 167          * to 0.  */
 168         for (unsigned sym = 0; sym < num_syms; sym++) {
 169                 u32 freq = freqs[sym];
 170                 if (freq != 0) {
 171                         symout[counters[min(freq, num_counters - 1)]++] =
 172                                 sym | (freq << NUM_SYMBOL_BITS);
 173                 } else {
 174                         lens[sym] = 0;
 175                 }
 176         }
 177
 178         /* Sort the symbols counted in the last counter.  */
 179         heapsort(symout + counters[num_counters - 2],
 180                  counters[num_counters - 1] - counters[num_counters - 2]);
 181
 182         return num_used_syms;
 183 }
 184
 185 /*
 186  * Build the Huffman tree.
 187  *
 188  * This is an optimized implementation that
 189  *      (a) takes advantage of the frequencies being already sorted;
 190  *      (b) only generates non-leaf nodes, since the non-leaf nodes of a
 191  *          Huffman tree are sufficient to generate a canonical code;
 192  *      (c) Only stores parent pointers, not child pointers;
 193  *      (d) Produces the nodes in the same memory used for input
 194  *          frequency information.
 195  *
 196  * Array 'A', which contains 'sym_count' entries, is used for both input
 197  * and output.  For this function, 'sym_count' must be at least 2.
 198  *
 199  * For input, the array must contain the frequencies of the symbols,
 200  * sorted in increasing order.  Specifically, each entry must contain a
 201  * frequency left shifted by NUM_SYMBOL_BITS bits.  Any data in the low
 202  * NUM_SYMBOL_BITS bits of the entries will be ignored by this function.
 203  * Although these bits will, in fact, contain the symbols that correspond
 204  * to the frequencies, this function is concerned with frequencies only
 205  * and keeps the symbols as-is.
 206  *
 207  * For output, this function will produce the non-leaf nodes of the
 208  * Huffman tree.  These nodes will be stored in the first (sym_count - 1)
 209  * entries of the array.  Entry A[sym_count - 2] will represent the root
 210  * node.  Each other node will contain the zero-based index of its parent
 211  * node in 'A', left shifted by NUM_SYMBOL_BITS bits.  The low
 212  * NUM_SYMBOL_BITS bits of each entry in A will be kept as-is.  Again,
 213  * note that although these low bits will, in fact, contain a symbol
 214  * value, this symbol will have *no relationship* with the Huffman tree
 215  * node that happens to occupy the same slot.  This is because this
 216  * implementation only generates the non-leaf nodes of the tree.
 217  */
 218 static void
 219 build_tree(u32 A[], unsigned sym_count)
 220 {
 221         /* Index, in 'A', of next lowest frequency symbol that has not
 222          * yet been processed.  */
 223         unsigned i = 0;
 224
 225         /* Index, in 'A', of next lowest frequency parentless non-leaf
 226          * node; or, if equal to 'e', then no such node exists yet.  */
 227         unsigned b = 0;
 228
 229         /* Index, in 'A', of next node to allocate as a non-leaf.  */
 230         unsigned e = 0;
 231
 232         do {
 233                 unsigned m, n;
 234                 u32 freq_shifted;
 235
 236                 /* Choose the two next lowest frequency entries.  */
 237
 238                 if (i != sym_count &&
 239                     (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS)))
 240                         m = i++;
 241                 else
 242                         m = b++;
 243
 244                 if (i != sym_count &&
 245                     (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS)))
 246                         n = i++;
 247                 else
 248                         n = b++;
 249
 250                 /* Allocate a non-leaf node and link the entries to it.
 251                  *
 252                  * If we link an entry that we're visiting for the first
 253                  * time (via index 'i'), then we're actually linking a
 254                  * leaf node and it will have no effect, since the leaf
 255                  * will be overwritten with a non-leaf when index 'e'
 256                  * catches up to it.  But it's not any slower to
 257                  * unconditionally set the parent index.
 258                  *
 259                  * We also compute the frequency of the non-leaf node as
 260                  * the sum of its two children's frequencies.  */
 261
 262                 freq_shifted = (A[m] & ~SYMBOL_MASK) + (A[n] & ~SYMBOL_MASK);
 263
 264                 A[m] = (A[m] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS);
 265                 A[n] = (A[n] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS);
 266                 A[e] = (A[e] & SYMBOL_MASK) | freq_shifted;
 267                 e++;
 268         } while (sym_count - e > 1);
 269                 /* When just one entry remains, it is a "leaf" that was
 270                  * linked to some other node.  We ignore it, since the
 271                  * rest of the array contains the non-leaves which we
 272                  * need.  (Note that we're assuming the cases with 0 or 1
 273                  * symbols were handled separately.) */
 274 }
 275
 276 /*
 277  * Given the stripped-down Huffman tree constructed by build_tree(),
 278  * determine the number of codewords that should be assigned each
 279  * possible length, taking into account the length-limited constraint.
 280  *
 281  * @A
 282  *      The array produced by build_tree(), containing parent index
 283  *      information for the non-leaf nodes of the Huffman tree.  Each
 284  *      entry in this array is a node; a node's parent always has a
 285  *      greater index than that node itself.  This function will
 286  *      overwrite the parent index information in this array, so
 287  *      essentially it will destroy the tree.  However, the data in the
 288  *      low NUM_SYMBOL_BITS of each entry will be preserved.
 289  *
 290  * @root_idx
 291  *      The 0-based index of the root node in 'A', and consequently one
 292  *      less than the number of tree node entries in 'A'.  (Or, really 2
 293  *      less than the actual length of 'A'.)
 294  *
 295  * @len_counts
 296  *      An array of length ('max_codeword_len' + 1) in which the number of
 297  *      codewords having each length <= max_codeword_len will be
 298  *      returned.
 299  *
 300  * @max_codeword_len
 301  *      The maximum permissible codeword length.
 302  */
 303 static void
 304 compute_length_counts(u32 A[restrict], unsigned root_idx,
 305                       unsigned len_counts[restrict], unsigned max_codeword_len)
 306 {
 307         /* The key observations are:
 308          *
 309          * (1) We can traverse the non-leaf nodes of the tree, always
 310          * visiting a parent before its children, by simply iterating
 311          * through the array in reverse order.  Consequently, we can
 312          * compute the depth of each node in one pass, overwriting the
 313          * parent indices with depths.
 314          *
 315          * (2) We can initially assume that in the real Huffman tree,
 316          * both children of the root are leaves.  This corresponds to two
 317          * codewords of length 1.  Then, whenever we visit a (non-leaf)
 318          * node during the traversal, we modify this assumption to
 319          * account for the current node *not* being a leaf, but rather
 320          * its two children being leaves.  This causes the loss of one
 321          * codeword for the current depth and the addition of two
 322          * codewords for the current depth plus one.
 323          *
 324          * (3) We can handle the length-limited constraint fairly easily
 325          * by simply using the largest length available when a depth
 326          * exceeds max_codeword_len.
 327          */
 328
 329         for (unsigned len = 0; len <= max_codeword_len; len++)
 330                 len_counts[len] = 0;
 331         len_counts[1] = 2;
 332
 333         /* Set the root node's depth to 0.  */
 334         A[root_idx] &= SYMBOL_MASK;
 335
 336         for (int node = root_idx - 1; node >= 0; node--) {
 337
 338                 /* Calculate the depth of this node.  */
 339
 340                 unsigned parent = A[node] >> NUM_SYMBOL_BITS;
 341                 unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS;
 342                 unsigned depth = parent_depth + 1;
 343                 unsigned len = depth;
 344
 345                 /* Set the depth of this node so that it is available
 346                  * when its children (if any) are processed.  */
 347
 348                 A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS);
 349
 350                 /* If needed, decrease the length to meet the
 351                  * length-limited constraint.  This is not the optimal
 352                  * method for generating length-limited Huffman codes!
 353                  * But it should be good enough.  */
 354                 if (len >= max_codeword_len) {
 355                         len = max_codeword_len;
 356                         do {
 357                                 len--;
 358                         } while (len_counts[len] == 0);
 359                 }
 360
 361                 /* Account for the fact that we have a non-leaf node at
 362                  * the current depth.  */
 363                 len_counts[len]--;
 364                 len_counts[len + 1] += 2;
 365         }
 366 }
 367
 368 /*
 369  * Generate the codewords for a canonical Huffman code.
 370  *
 371  * @A
 372  *      The output array for codewords.  In addition, initially this
 373  *      array must contain the symbols, sorted primarily by frequency and
 374  *      secondarily by symbol value, in the low NUM_SYMBOL_BITS bits of
 375  *      each entry.
 376  *
 377  * @len
 378  *      Output array for codeword lengths.
 379  *
 380  * @len_counts
 381  *      An array that provides the number of codewords that will have
 382  *      each possible length <= max_codeword_len.
 383  *
 384  * @max_codeword_len
 385  *      Maximum length, in bits, of each codeword.
 386  *
 387  * @num_syms
 388  *      Number of symbols in the alphabet, including symbols with zero
 389  *      frequency.  This is the length of the 'A' and 'len' arrays.
 390  */
 391 static void
 392 gen_codewords(u32 A[restrict], u8 lens[restrict],
 393               const unsigned len_counts[restrict],
 394               unsigned max_codeword_len, unsigned num_syms)
 395 {
 396         u32 next_codewords[max_codeword_len + 1];
 397
 398         /* Given the number of codewords that will have each length,
 399          * assign codeword lengths to symbols.  We do this by assigning
 400          * the lengths in decreasing order to the symbols sorted
 401          * primarily by increasing frequency and secondarily by
 402          * increasing symbol value.  */
 403         for (unsigned i = 0, len = max_codeword_len; len >= 1; len--) {
 404                 unsigned count = len_counts[len];
 405                 while (count--)
 406                         lens[A[i++] & SYMBOL_MASK] = len;
 407         }
 408
 409         /* Generate the codewords themselves.  We initialize the
 410          * 'next_codewords' array to provide the lexicographically first
 411          * codeword of each length, then assign codewords in symbol
 412          * order.  This produces a canonical code.  */
 413         next_codewords[0] = 0;
 414         next_codewords[1] = 0;
 415         for (unsigned len = 2; len <= max_codeword_len; len++)
 416                 next_codewords[len] =
 417                         (next_codewords[len - 1] + len_counts[len - 1]) << 1;
 418
 419         for (unsigned sym = 0; sym < num_syms; sym++)
 420                 A[sym] = next_codewords[lens[sym]]++;
 421 }
 422
 423 /*
 424  * ---------------------------------------------------------------------
 425  *                      make_canonical_huffman_code()
 426  * ---------------------------------------------------------------------
 427  *
 428  * Given an alphabet and the frequency of each symbol in it, construct a
 429  * length-limited canonical Huffman code.
 430  *
 431  * @num_syms
 432  *      The number of symbols in the alphabet.  The symbols are the
 433  *      integers in the range [0, num_syms - 1].  This parameter must be
 434  *      at least 2 and can't be greater than (1 << NUM_SYMBOL_BITS).
 435  *
 436  * @max_codeword_len
 437  *      The maximum permissible codeword length.
 438  *
 439  * @freqs
 440  *      An array of @num_syms entries, each of which specifies the
 441  *      frequency of the corresponding symbol.  It is valid for some,
 442  *      none, or all of the frequencies to be 0.
 443  *
 444  * @lens
 445  *      An array of @num_syms entries in which this function will return
 446  *      the length, in bits, of the codeword assigned to each symbol.
 447  *      Symbols with 0 frequency will not have codewords per se, but
 448  *      their entries in this array will be set to 0.  No lengths greater
 449  *      than @max_codeword_len will be assigned.
 450  *
 451  * @codewords
 452  *      An array of @num_syms entries in which this function will return
 453  *      the codeword for each symbol, right-justified and padded on the
 454  *      left with zeroes.  Codewords for symbols with 0 frequency will be
 455  *      undefined.
 456  *
 457  * ---------------------------------------------------------------------
 458  *
 459  * This function builds a length-limited canonical Huffman code.
 460  *
 461  * A length-limited Huffman code contains no codewords longer than some
 462  * specified length, and has exactly (with some algorithms) or
 463  * approximately (with the algorithm used here) the minimum weighted path
 464  * length from the root, given this constraint.
 465  *
 466  * A canonical Huffman code satisfies the properties that a longer
 467  * codeword never lexicographically precedes a shorter codeword, and the
 468  * lexicographic ordering of codewords of the same length is the same as
 469  * the lexicographic ordering of the corresponding symbols.  A canonical
 470  * Huffman code, or more generally a canonical prefix code, can be
 471  * reconstructed from only a list containing the codeword length of each
 472  * symbol.
 473  *
 474  * The classic algorithm to generate a Huffman code creates a node for
 475  * each symbol, then inserts these nodes into a min-heap keyed by symbol
 476  * frequency.  Then, repeatedly, the two lowest-frequency nodes are
 477  * removed from the min-heap and added as the children of a new node
 478  * having frequency equal to the sum of its two children, which is then
 479  * inserted into the min-heap.  When only a single node remains in the
 480  * min-heap, it is the root of the Huffman tree.  The codeword for each
 481  * symbol is determined by the path needed to reach the corresponding
 482  * node from the root.  Descending to the left child appends a 0 bit,
 483  * whereas descending to the right child appends a 1 bit.
 484  *
 485  * The classic algorithm is relatively easy to understand, but it is
 486  * subject to a number of inefficiencies.  In practice, it is fastest to
 487  * first sort the symbols by frequency.  (This itself can be subject to
 488  * an optimization based on the fact that most frequencies tend to be
 489  * low.)  At the same time, we sort secondarily by symbol value, which
 490  * aids the process of generating a canonical code.  Then, during tree
 491  * construction, no heap is necessary because both the leaf nodes and the
 492  * unparented non-leaf nodes can be easily maintained in sorted order.
 493  * Consequently, there can never be more than two possibilities for the
 494  * next-lowest-frequency node.
 495  *
 496  * In addition, because we're generating a canonical code, we actually
 497  * don't need the leaf nodes of the tree at all, only the non-leaf nodes.
 498  * This is because for canonical code generation we don't need to know
 499  * where the symbols are in the tree.  Rather, we only need to know how
 500  * many leaf nodes have each depth (codeword length).  And this
 501  * information can, in fact, be quickly generated from the tree of
 502  * non-leaves only.
 503  *
 504  * Furthermore, we can build this stripped-down Huffman tree directly in
 505  * the array in which the codewords are to be generated, provided that
 506  * these array slots are large enough to hold a symbol and frequency
 507  * value.
 508  *
 509  * Still furthermore, we don't even need to maintain explicit child
 510  * pointers.  We only need the parent pointers, and even those can be
 511  * overwritten in-place with depth information as part of the process of
 512  * extracting codeword lengths from the tree.  So in summary, we do NOT
 513  * need a big structure like:
 514  *
 515  *      struct huffman_tree_node {
 516  *              unsigned int symbol;
 517  *              unsigned int frequency;
 518  *              unsigned int depth;
 519  *              struct huffman_tree_node *left_child;
 520  *              struct huffman_tree_node *right_child;
 521  *      };
 522  *
 523  *
 524  *   ... which often gets used in "naive" implementations of Huffman code
 525  *   generation.
 526  *
 527  * Most of these optimizations are based on the implementation in 7-Zip
 528  * (source file: C/HuffEnc.c), which has been placed in the public domain
 529  * by Igor Pavlov.  But I've rewritten the code with extensive comments,
 530  * as it took me a while to figure out what it was doing...!
 531  *
 532  * ---------------------------------------------------------------------
 533  *
 534  * NOTE: in general, the same frequencies can be used to generate
 535  * different length-limited canonical Huffman codes.  One choice we have
 536  * is during tree construction, when we must decide whether to prefer a
 537  * leaf or non-leaf when there is a tie in frequency.  Another choice we
 538  * have is how to deal with codewords that would exceed @max_codeword_len
 539  * bits in length.  Both of these choices affect the resulting codeword
 540  * lengths, which otherwise can be mapped uniquely onto the resulting
 541  * canonical Huffman code.
 542  *
 543  * Normally, there is no problem with choosing one valid code over
 544  * another, provided that they produce similar compression ratios.
 545  * However, the LZMS compression format uses adaptive Huffman coding.  It
 546  * requires that both the decompressor and compressor build a canonical
 547  * code equivalent to that which can be generated by using the classic
 548  * Huffman tree construction algorithm and always processing leaves
 549  * before non-leaves when there is a frequency tie.  Therefore, we make
 550  * sure to do this.  This method also has the advantage of sometimes
 551  * shortening the longest codeword that is generated.
 552  *
 553  * There also is the issue of how codewords longer than @max_codeword_len
 554  * are dealt with.  Fortunately, for LZMS this is irrelevant because for
 555  * the LZMS alphabets no codeword can ever exceed LZMS_MAX_CODEWORD_LEN
 556  * (= 15).  Since the LZMS algorithm regularly halves all frequencies,
 557  * the frequencies cannot become high enough for a length 16 codeword to
 558  * be generated.  Specifically, I think that if ties are broken in favor
 559  * of non-leaves (as we do), the lowest total frequency that would give a
 560  * length-16 codeword would be the sum of the frequencies 1 1 1 3 4 7 11
 561  * 18 29 47 76 123 199 322 521 843 1364, which is 3570.  And in LZMS we
 562  * can't get a frequency that high based on the alphabet sizes, rebuild
 563  * frequencies, and scaling factors.  This worst-case scenario is based
 564  * on the following degenerate case (only the bottom of the tree shown):
 565  *
 566  *                          ...
 567  *                        17
 568  *                       /  \
 569  *                      10   7
 570  *                     / \
 571  *                    6   4
 572  *                   / \
 573  *                  3   3
 574  *                 / \
 575  *                2   1
 576  *               / \
 577  *              1   1
 578  *
 579  * Excluding the first leaves (those with value 1), each leaf value must
 580  * be greater than the non-leaf up 1 and down 2 from it; otherwise that
 581  * leaf would have taken precedence over that non-leaf and been combined
 582  * with the leaf below, thereby decreasing the height compared to that
 583  * shown.
 584  *
 585  * Interesting fact: if we were to instead prioritize non-leaves over
 586  * leaves, then the worst case frequencies would be the Fibonacci
 587  * sequence, plus an extra frequency of 1.  In this hypothetical
 588  * scenario, it would be slightly easier for longer codewords to be
 589  * generated.
 590  */
 591 void
 592 make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len,
 593                             const u32 freqs[restrict],
 594                             u8 lens[restrict], u32 codewords[restrict])
 595 {
 596         u32 *A = codewords;
 597         unsigned num_used_syms;
 598
 599         /* We begin by sorting the symbols primarily by frequency and
 600          * secondarily by symbol value.  As an optimization, the array
 601          * used for this purpose ('A') shares storage with the space in
 602          * which we will eventually return the codewords.  */
 603
 604         num_used_syms = sort_symbols(num_syms, freqs, lens, A);
 605
 606         /* 'num_used_syms' is the number of symbols with nonzero
 607          * frequency.  This may be less than @num_syms.  'num_used_syms'
 608          * is also the number of entries in 'A' that are valid.  Each
 609          * entry consists of a distinct symbol and a nonzero frequency
 610          * packed into a 32-bit integer.  */
 611
 612         /* Handle special cases where only 0 or 1 symbols were used (had
 613          * nonzero frequency).  */
 614
 615         if (unlikely(num_used_syms == 0)) {
 616                 /* Code is empty.  sort_symbols() already set all lengths
 617                  * to 0, so there is nothing more to do.  */
 618                 return;
 619         }
 620
 621         if (unlikely(num_used_syms == 1)) {
 622                 /* Only one symbol was used, so we only need one
 623                  * codeword.  But two codewords are needed to form the
 624                  * smallest complete Huffman code, which uses codewords 0
 625                  * and 1.  Therefore, we choose another symbol to which
 626                  * to assign a codeword.  We use 0 (if the used symbol is
 627                  * not 0) or 1 (if the used symbol is 0).  In either
 628                  * case, the lesser-valued symbol must be assigned
 629                  * codeword 0 so that the resulting code is canonical.  */
 630
 631                 unsigned sym = A[0] & SYMBOL_MASK;
 632                 unsigned nonzero_idx = sym ? sym : 1;
 633
 634                 codewords[0] = 0;
 635                 lens[0] = 1;
 636                 codewords[nonzero_idx] = 1;
 637                 lens[nonzero_idx] = 1;
 638                 return;
 639         }
 640
 641         /* Build a stripped-down version of the Huffman tree, sharing the
 642          * array 'A' with the symbol values.  Then extract length counts
 643          * from the tree and use them to generate the final codewords.  */
 644
 645         build_tree(A, num_used_syms);
 646
 647         {
 648                 unsigned len_counts[max_codeword_len + 1];
 649
 650                 compute_length_counts(A, num_used_syms - 2,
 651                                       len_counts, max_codeword_len);
 652
 653                 gen_codewords(A, lens, len_counts, max_codeword_len, num_syms);
 654         }
 655 }