4 * Code for compression shared among multiple compression formats.
6 * Copyright 2022 Eric Biggers
8 * Permission is hereby granted, free of charge, to any person
9 * obtaining a copy of this software and associated documentation
10 * files (the "Software"), to deal in the Software without
11 * restriction, including without limitation the rights to use,
12 * copy, modify, merge, publish, distribute, sublicense, and/or sell
13 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
17 * The above copyright notice and this permission notice shall be
18 * included in all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27 * OTHER DEALINGS IN THE SOFTWARE.
36 #include "wimlib/assert.h"
37 #include "wimlib/compress_common.h"
38 #include "wimlib/util.h"
41 * Given the binary tree node A[subtree_idx] whose children already satisfy the
42 * maxheap property, swap the node with its greater child until it is greater
43 * than or equal to both of its children, so that the maxheap property is
44 * satisfied in the subtree rooted at A[subtree_idx]. 'A' uses 1-based indices.
static void
heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx)
{
	unsigned parent_idx;
	unsigned child_idx;
	u32 v;

	/* Sift the value at the subtree root down until the maxheap property
	 * holds.  Both child subtrees are assumed to already be maxheaps. */
	v = A[subtree_idx];
	parent_idx = subtree_idx;
	while ((child_idx = parent_idx * 2) <= length) {
		/* Choose the greater of the (up to two) children. */
		if (child_idx < length && A[child_idx + 1] > A[child_idx])
			child_idx++;
		/* Stop once the sifted value dominates both children. */
		if (v >= A[child_idx])
			break;
		/* Move the greater child up; keep sifting down. */
		A[parent_idx] = A[child_idx];
		parent_idx = child_idx;
	}
	/* Place the original root value at its final position. */
	A[parent_idx] = v;
}
67 * Rearrange the array 'A' so that it satisfies the maxheap property.
68 * 'A' uses 1-based indices, so the children of A[i] are A[i*2] and A[i*2 + 1].
static void
heapify_array(u32 A[], unsigned length)
{
	unsigned subtree_idx;

	/*
	 * Process subtree roots in reverse order so that each node's children
	 * already satisfy the maxheap property when the node itself is
	 * heapified.  Nodes past length/2 are leaves, which are trivially
	 * valid maxheaps on their own, so they can be skipped.
	 */
	for (subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--)
		heapify_subtree(A, length, subtree_idx);
}
80 * Sort the array 'A', which contains 'length' unsigned 32-bit integers.
82 * Note: name this function heap_sort() instead of heapsort() to avoid colliding
83 * with heapsort() from stdlib.h on BSD-derived systems --- though this isn't
84 * necessary when compiling with -D_ANSI_SOURCE, which is the better solution.
static void
heap_sort(u32 A[], unsigned length)
{
	A--; /* Use 1-based indices */

	heapify_array(A, length);

	/*
	 * Standard heapsort extraction: repeatedly swap the maximum (the heap
	 * root A[1]) with the last element of the shrinking heap, then restore
	 * the maxheap property on the reduced heap.  The sorted suffix grows
	 * from the end of the array toward the front.
	 */
	while (length >= 2) {
		u32 tmp = A[length];

		A[length] = A[1];
		A[1] = tmp;
		length--;
		heapify_subtree(A, length, 1);
	}
}
103 #define NUM_SYMBOL_BITS 10
104 #define NUM_FREQ_BITS (32 - NUM_SYMBOL_BITS)
105 #define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1)
106 #define FREQ_MASK (~SYMBOL_MASK)
108 #define GET_NUM_COUNTERS(num_syms) (num_syms)
111 * Sort the symbols primarily by frequency and secondarily by symbol value.
112 * Discard symbols with zero frequency and fill in an array with the remaining
113 * symbols, along with their frequencies. The low NUM_SYMBOL_BITS bits of each
114 * array entry will contain the symbol value, and the remaining bits will
115 * contain the frequency.
118 * Number of symbols in the alphabet, at most 1 << NUM_SYMBOL_BITS.
121 * Frequency of each symbol, summing to at most (1 << NUM_FREQ_BITS) - 1.
124 * An array that eventually will hold the length of each codeword. This
125 * function only fills in the codeword lengths for symbols that have zero
126 * frequency, which are not well defined per se but will be set to 0.
129 * The output array, described above.
131 * Returns the number of entries in 'symout' that were filled. This is the
132 * number of symbols that have nonzero frequency.
135 sort_symbols(unsigned num_syms, const u32 freqs[], u8 lens[], u32 symout[])
139 unsigned num_used_syms;
140 unsigned num_counters;
141 unsigned counters[GET_NUM_COUNTERS(MAX_NUM_SYMS)];
144 * We use heapsort, but with an added optimization. Since often most
145 * symbol frequencies are low, we first do a count sort using a limited
146 * number of counters. High frequencies are counted in the last
147 * counter, and only they will be sorted with heapsort.
149 * Note: with more symbols, it is generally beneficial to have more
150 * counters. About 1 counter per symbol seems fastest.
153 num_counters = GET_NUM_COUNTERS(num_syms);
155 memset(counters, 0, num_counters * sizeof(counters[0]));
157 /* Count the frequencies. */
158 for (sym = 0; sym < num_syms; sym++)
159 counters[MIN(freqs[sym], num_counters - 1)]++;
162 * Make the counters cumulative, ignoring the zero-th, which counted
163 * symbols with zero frequency. As a side effect, this calculates the
164 * number of symbols with nonzero frequency.
167 for (i = 1; i < num_counters; i++) {
168 unsigned count = counters[i];
170 counters[i] = num_used_syms;
171 num_used_syms += count;
175 * Sort nonzero-frequency symbols using the counters. At the same time,
176 * set the codeword lengths of zero-frequency symbols to 0.
178 for (sym = 0; sym < num_syms; sym++) {
179 u32 freq = freqs[sym];
182 symout[counters[MIN(freq, num_counters - 1)]++] =
183 sym | (freq << NUM_SYMBOL_BITS);
189 /* Sort the symbols counted in the last counter. */
190 heap_sort(symout + counters[num_counters - 2],
191 counters[num_counters - 1] - counters[num_counters - 2]);
193 return num_used_syms;
197 * Build a Huffman tree.
199 * This is an optimized implementation that
200 * (a) takes advantage of the frequencies being already sorted;
201 * (b) only generates non-leaf nodes, since the non-leaf nodes of a Huffman
202 * tree are sufficient to generate a canonical code;
203 * (c) Only stores parent pointers, not child pointers;
204 * (d) Produces the nodes in the same memory used for input frequency
207 * Array 'A', which contains 'sym_count' entries, is used for both input and
208 * output. For this function, 'sym_count' must be at least 2.
210 * For input, the array must contain the frequencies of the symbols, sorted in
211 * increasing order. Specifically, each entry must contain a frequency left
212 * shifted by NUM_SYMBOL_BITS bits. Any data in the low NUM_SYMBOL_BITS bits of
213 * the entries will be ignored by this function. Although these bits will, in
214 * fact, contain the symbols that correspond to the frequencies, this function
215 * is concerned with frequencies only and keeps the symbols as-is.
217 * For output, this function will produce the non-leaf nodes of the Huffman
218 * tree. These nodes will be stored in the first (sym_count - 1) entries of the
219 * array. Entry A[sym_count - 2] will represent the root node. Each other node
220 * will contain the zero-based index of its parent node in 'A', left shifted by
221 * NUM_SYMBOL_BITS bits. The low NUM_SYMBOL_BITS bits of each entry in A will
222 * be kept as-is. Again, note that although these low bits will, in fact,
223 * contain a symbol value, this symbol will have *no relationship* with the
224 * Huffman tree node that happens to occupy the same slot. This is because this
225 * implementation only generates the non-leaf nodes of the tree.
228 build_tree(u32 A[], unsigned sym_count)
230 const unsigned last_idx = sym_count - 1;
232 /* Index of the next lowest frequency leaf that still needs a parent */
236 * Index of the next lowest frequency non-leaf that still needs a
237 * parent, or 'e' if there is currently no such node
241 /* Index of the next spot for a non-leaf (will overwrite a leaf) */
248 * Select the next two lowest frequency nodes among the leaves
249 * A[i] and non-leaves A[b], and create a new node A[e] to be
250 * their parent. Set the new node's frequency to the sum of the
251 * frequencies of its two children.
253 * Usually the next two lowest frequency nodes are of the same
254 * type (leaf or non-leaf), so check those cases first.
256 if (i + 1 <= last_idx &&
257 (b == e || (A[i + 1] & FREQ_MASK) <= (A[b] & FREQ_MASK))) {
259 new_freq = (A[i] & FREQ_MASK) + (A[i + 1] & FREQ_MASK);
261 } else if (b + 2 <= e &&
263 (A[b + 1] & FREQ_MASK) < (A[i] & FREQ_MASK))) {
265 new_freq = (A[b] & FREQ_MASK) + (A[b + 1] & FREQ_MASK);
266 A[b] = (e << NUM_SYMBOL_BITS) | (A[b] & SYMBOL_MASK);
267 A[b + 1] = (e << NUM_SYMBOL_BITS) |
268 (A[b + 1] & SYMBOL_MASK);
271 /* One leaf and one non-leaf */
272 new_freq = (A[i] & FREQ_MASK) + (A[b] & FREQ_MASK);
273 A[b] = (e << NUM_SYMBOL_BITS) | (A[b] & SYMBOL_MASK);
277 A[e] = new_freq | (A[e] & SYMBOL_MASK);
279 * A binary tree with 'n' leaves has 'n - 1' non-leaves, so the
280 * tree is complete once we've created 'n - 1' non-leaves.
282 } while (++e < last_idx);
286 * Given the stripped-down Huffman tree constructed by build_tree(), determine
287 * the number of codewords that should be assigned each possible length, taking
288 * into account the length-limited constraint.
291 * The array produced by build_tree(), containing parent index information
292 * for the non-leaf nodes of the Huffman tree. Each entry in this array is
293 * a node; a node's parent always has a greater index than that node
294 * itself. This function will overwrite the parent index information in
295 * this array, so essentially it will destroy the tree. However, the data
296 * in the low NUM_SYMBOL_BITS of each entry will be preserved.
299 * The 0-based index of the root node in 'A', and consequently one less
300 * than the number of tree node entries in 'A'. (Or, really 2 less than
301 * the actual length of 'A'.)
304 * An array of length ('max_codeword_len' + 1) in which the number of
305 * codewords having each length <= max_codeword_len will be returned.
308 * The maximum permissible codeword length.
311 compute_length_counts(u32 A[], unsigned root_idx, unsigned len_counts[],
312 unsigned max_codeword_len)
318 * The key observations are:
320 * (1) We can traverse the non-leaf nodes of the tree, always visiting a
321 * parent before its children, by simply iterating through the array
322 * in reverse order. Consequently, we can compute the depth of each
323 * node in one pass, overwriting the parent indices with depths.
325 * (2) We can initially assume that in the real Huffman tree, both
326 * children of the root are leaves. This corresponds to two
327 * codewords of length 1. Then, whenever we visit a (non-leaf) node
328 * during the traversal, we modify this assumption to account for
329 * the current node *not* being a leaf, but rather its two children
330 * being leaves. This causes the loss of one codeword for the
331 * current depth and the addition of two codewords for the current
334 * (3) We can handle the length-limited constraint fairly easily by
335 * simply using the largest length available when a depth exceeds
339 for (len = 0; len <= max_codeword_len; len++)
343 /* Set the root node's depth to 0. */
344 A[root_idx] &= SYMBOL_MASK;
346 for (node = root_idx - 1; node >= 0; node--) {
348 /* Calculate the depth of this node. */
350 unsigned parent = A[node] >> NUM_SYMBOL_BITS;
351 unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS;
352 unsigned depth = parent_depth + 1;
353 unsigned len = depth;
356 * Set the depth of this node so that it is available when its
357 * children (if any) are processed.
359 A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS);
362 * If needed, decrease the length to meet the length-limited
363 * constraint. This is not the optimal method for generating
364 * length-limited Huffman codes! But it should be good enough.
366 if (len >= max_codeword_len) {
367 len = max_codeword_len;
370 } while (len_counts[len] == 0);
374 * Account for the fact that we have a non-leaf node at the
378 len_counts[len + 1] += 2;
383 * Generate the codewords for a canonical Huffman code.
386 * The output array for codewords. In addition, initially this
387 * array must contain the symbols, sorted primarily by frequency and
388 * secondarily by symbol value, in the low NUM_SYMBOL_BITS bits of
392 * Output array for codeword lengths.
395 * An array that provides the number of codewords that will have
396 * each possible length <= max_codeword_len.
399 * Maximum length, in bits, of each codeword.
402 * Number of symbols in the alphabet, including symbols with zero
403 * frequency. This is the length of the 'A' and 'len' arrays.
406 gen_codewords(u32 A[], u8 lens[], const unsigned len_counts[],
407 unsigned max_codeword_len, unsigned num_syms)
409 u32 next_codewords[MAX_CODEWORD_LEN + 1];
415 * Given the number of codewords that will have each length, assign
416 * codeword lengths to symbols. We do this by assigning the lengths in
417 * decreasing order to the symbols sorted primarily by increasing
418 * frequency and secondarily by increasing symbol value.
420 for (i = 0, len = max_codeword_len; len >= 1; len--) {
421 unsigned count = len_counts[len];
424 lens[A[i++] & SYMBOL_MASK] = len;
428 * Generate the codewords themselves. We initialize the
429 * 'next_codewords' array to provide the lexicographically first
430 * codeword of each length, then assign codewords in symbol order. This
431 * produces a canonical code.
433 next_codewords[0] = 0;
434 next_codewords[1] = 0;
435 for (len = 2; len <= max_codeword_len; len++)
436 next_codewords[len] =
437 (next_codewords[len - 1] + len_counts[len - 1]) << 1;
439 for (sym = 0; sym < num_syms; sym++)
440 A[sym] = next_codewords[lens[sym]]++;
444 * ---------------------------------------------------------------------
445 * make_canonical_huffman_code()
446 * ---------------------------------------------------------------------
448 * Given an alphabet and the frequency of each symbol in it, construct a
449 * length-limited canonical Huffman code.
452 * The number of symbols in the alphabet. The symbols are the integers in
453 * the range [0, num_syms - 1]. This parameter must be at least 2 and
454 * must not exceed (1 << NUM_SYMBOL_BITS).
457 * The maximum permissible codeword length.
460 * An array of length @num_syms that gives the frequency of each symbol.
461 * It is valid for some, none, or all of the frequencies to be 0. The sum
462 * of frequencies must not exceed (1 << NUM_FREQ_BITS) - 1.
465 * An array of @num_syms entries in which this function will return the
466 * length, in bits, of the codeword assigned to each symbol. Symbols with
467 * 0 frequency will not have codewords per se, but their entries in this
468 * array will be set to 0. No lengths greater than @max_codeword_len will
472 * An array of @num_syms entries in which this function will return the
473 * codeword for each symbol, right-justified and padded on the left with
474 * zeroes. Codewords for symbols with 0 frequency will be undefined.
476 * ---------------------------------------------------------------------
478 * This function builds a length-limited canonical Huffman code.
480 * A length-limited Huffman code contains no codewords longer than some
481 * specified length, and has exactly (with some algorithms) or approximately
482 * (with the algorithm used here) the minimum weighted path length from the
483 * root, given this constraint.
485 * A canonical Huffman code satisfies the properties that a longer codeword
486 * never lexicographically precedes a shorter codeword, and the lexicographic
487 * ordering of codewords of the same length is the same as the lexicographic
488 * ordering of the corresponding symbols. A canonical Huffman code, or more
489 * generally a canonical prefix code, can be reconstructed from only a list
490 * containing the codeword length of each symbol.
492 * The classic algorithm to generate a Huffman code creates a node for each
493 * symbol, then inserts these nodes into a min-heap keyed by symbol frequency.
494 * Then, repeatedly, the two lowest-frequency nodes are removed from the
495 * min-heap and added as the children of a new node having frequency equal to
496 * the sum of its two children, which is then inserted into the min-heap. When
497 * only a single node remains in the min-heap, it is the root of the Huffman
498 * tree. The codeword for each symbol is determined by the path needed to reach
499 * the corresponding node from the root. Descending to the left child appends a
500 * 0 bit, whereas descending to the right child appends a 1 bit.
502 * The classic algorithm is relatively easy to understand, but it is subject to
503 * a number of inefficiencies. In practice, it is fastest to first sort the
504 * symbols by frequency. (This itself can be subject to an optimization based
505 * on the fact that most frequencies tend to be low.) At the same time, we sort
506 * secondarily by symbol value, which aids the process of generating a canonical
507 * code. Then, during tree construction, no heap is necessary because both the
508 * leaf nodes and the unparented non-leaf nodes can be easily maintained in
509 * sorted order. Consequently, there can never be more than two possibilities
510 * for the next-lowest-frequency node.
512 * In addition, because we're generating a canonical code, we actually don't
513 * need the leaf nodes of the tree at all, only the non-leaf nodes. This is
514 * because for canonical code generation we don't need to know where the symbols
515 * are in the tree. Rather, we only need to know how many leaf nodes have each
516 * depth (codeword length). And this information can, in fact, be quickly
517 * generated from the tree of non-leaves only.
519 * Furthermore, we can build this stripped-down Huffman tree directly in the
520 * array in which the codewords are to be generated, provided that these array
521 * slots are large enough to hold a symbol and frequency value.
523 * Still furthermore, we don't even need to maintain explicit child pointers.
524 * We only need the parent pointers, and even those can be overwritten in-place
525 * with depth information as part of the process of extracting codeword lengths
526 * from the tree. So in summary, we do NOT need a big structure like:
528 * struct huffman_tree_node {
529 * unsigned int symbol;
530 * unsigned int frequency;
531 * unsigned int depth;
532 * struct huffman_tree_node *left_child;
533 * struct huffman_tree_node *right_child;
537 * ... which often gets used in "naive" implementations of Huffman code
540 * Many of these optimizations are based on the implementation in 7-Zip (source
541 * file: C/HuffEnc.c), which was placed in the public domain by Igor Pavlov.
543 * NOTE: in general, the same frequencies can be used to generate different
544 * length-limited canonical Huffman codes. One choice we have is during tree
545 * construction, when we must decide whether to prefer a leaf or non-leaf when
546 * there is a tie in frequency. Another choice we have is how to deal with
547 * codewords that would exceed @max_codeword_len bits in length. Both of these
548 * choices affect the resulting codeword lengths, which otherwise can be mapped
549 * uniquely onto the resulting canonical Huffman code.
551 * Normally, there is no problem with choosing one valid code over another,
552 * provided that they produce similar compression ratios. However, the LZMS
553 * compression format uses adaptive Huffman coding. It requires that both the
554 * decompressor and compressor build a canonical code equivalent to that which
555 * can be generated by using the classic Huffman tree construction algorithm and
556 * always processing leaves before non-leaves when there is a frequency tie.
557 * Therefore, we make sure to do this. This method also has the advantage of
558 * sometimes shortening the longest codeword that is generated.
560 * There also is the issue of how codewords longer than @max_codeword_len are
561 * dealt with. Fortunately, for LZMS this is irrelevant because for the LZMS
562 * alphabets no codeword can ever exceed LZMS_MAX_CODEWORD_LEN (= 15). Since
563 * the LZMS algorithm regularly halves all frequencies, the frequencies cannot
564 * become high enough for a length 16 codeword to be generated. Specifically, I
565 * think that if ties are broken in favor of non-leaves (as we do), the lowest
566 * total frequency that would give a length-16 codeword would be the sum of the
567 * frequencies 1 1 1 3 4 7 11 18 29 47 76 123 199 322 521 843 1364, which is
568 * 3570. And in LZMS we can't get a frequency that high based on the alphabet
569 * sizes, rebuild frequencies, and scaling factors. This worst-case scenario is
570 * based on the following degenerate case (only the bottom of the tree shown):
585 * Excluding the first leaves (those with value 1), each leaf value must be
586 * greater than the non-leaf up 1 and down 2 from it; otherwise that leaf would
587 * have taken precedence over that non-leaf and been combined with the leaf
588 * below, thereby decreasing the height compared to that shown.
590 * Interesting fact: if we were to instead prioritize non-leaves over leaves,
591 * then the worst case frequencies would be the Fibonacci sequence, plus an
592 * extra frequency of 1. In this hypothetical scenario, it would be slightly
593 * easier for longer codewords to be generated.
596 make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len,
597 const u32 freqs[], u8 lens[], u32 codewords[])
600 unsigned num_used_syms;
602 wimlib_assert(num_syms <= MAX_NUM_SYMS);
603 STATIC_ASSERT(MAX_NUM_SYMS <= 1 << NUM_SYMBOL_BITS);
604 wimlib_assert(max_codeword_len <= MAX_CODEWORD_LEN);
607 * We begin by sorting the symbols primarily by frequency and
608 * secondarily by symbol value. As an optimization, the array used for
609 * this purpose ('A') shares storage with the space in which we will
610 * eventually return the codewords.
612 num_used_syms = sort_symbols(num_syms, freqs, lens, A);
615 * 'num_used_syms' is the number of symbols with nonzero frequency.
616 * This may be less than @num_syms. 'num_used_syms' is also the number
617 * of entries in 'A' that are valid. Each entry consists of a distinct
618 * symbol and a nonzero frequency packed into a 32-bit integer.
622 * Handle special cases where only 0 or 1 symbols were used (had nonzero
626 if (unlikely(num_used_syms == 0)) {
628 * Code is empty. sort_symbols() already set all lengths to 0,
629 * so there is nothing more to do.
634 if (unlikely(num_used_syms == 1)) {
636 * Only one symbol was used, so we only need one codeword. But
637 * two codewords are needed to form the smallest complete
638 * Huffman code, which uses codewords 0 and 1. Therefore, we
639 * choose another symbol to which to assign a codeword. We use
640 * 0 (if the used symbol is not 0) or 1 (if the used symbol is
641 * 0). In either case, the lesser-valued symbol must be
642 * assigned codeword 0 so that the resulting code is canonical.
645 unsigned sym = A[0] & SYMBOL_MASK;
646 unsigned nonzero_idx = sym ? sym : 1;
650 codewords[nonzero_idx] = 1;
651 lens[nonzero_idx] = 1;
656 * Build a stripped-down version of the Huffman tree, sharing the array
657 * 'A' with the symbol values. Then extract length counts from the tree
658 * and use them to generate the final codewords.
661 build_tree(A, num_used_syms);
664 unsigned len_counts[MAX_CODEWORD_LEN + 1];
666 compute_length_counts(A, num_used_syms - 2,
667 len_counts, max_codeword_len);
669 gen_codewords(A, lens, len_counts, max_codeword_len, num_syms);