#include <string.h>
+#include "wimlib/assert.h"
#include "wimlib/compress_common.h"
#include "wimlib/util.h"
-/* Given the binary tree node A[subtree_idx] whose children already
- * satisfy the maxheap property, swap the node with its greater child
- * until it is greater than both its children, so that the maxheap
- * property is satisfied in the subtree rooted at A[subtree_idx]. */
+/*
+ * Given the binary tree node A[subtree_idx] whose children already satisfy the
+ * maxheap property, swap the node with its greater child until it is greater
+ * than or equal to both of its children, so that the maxheap property is
+ * satisfied in the subtree rooted at A[subtree_idx]. 'A' uses 1-based indices.
+ */
static void
heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx)
{
A[parent_idx] = v;
}
-/* Rearrange the array 'A' so that it satisfies the maxheap property.
+/*
+ * Rearrange the array 'A' so that it satisfies the maxheap property.
* 'A' uses 1-based indices, so the children of A[i] are A[i*2] and A[i*2 + 1].
*/
static void
heapify_array(u32 A[], unsigned length)
{
- for (unsigned subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--)
+ unsigned subtree_idx;
+
+ for (subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--)
heapify_subtree(A, length, subtree_idx);
}
-/* Sort the array 'A', which contains 'length' unsigned 32-bit integers. */
+/*
+ * Sort the array 'A', which contains 'length' unsigned 32-bit integers.
+ *
+ * Note: name this function heap_sort() instead of heapsort() to avoid colliding
+ * with heapsort() from stdlib.h on BSD-derived systems --- though this isn't
+ * necessary when compiling with -D_ANSI_SOURCE, which is the better solution.
+ */
static void
-heapsort(u32 A[], unsigned length)
+heap_sort(u32 A[], unsigned length)
{
A--; /* Use 1-based indices */
heapify_array(A, length);
while (length >= 2) {
- swap(A[1], A[length]);
+ u32 tmp = A[length];
+
+ A[length] = A[1];
+ A[1] = tmp;
length--;
heapify_subtree(A, length, 1);
}
}
#define NUM_SYMBOL_BITS 10
-#define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1)
+#define NUM_FREQ_BITS (32 - NUM_SYMBOL_BITS)
+#define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1)
+#define FREQ_MASK (~SYMBOL_MASK)
+
+#define GET_NUM_COUNTERS(num_syms) (num_syms)
/*
- * Sort the symbols primarily by frequency and secondarily by symbol
- * value. Discard symbols with zero frequency and fill in an array with
- * the remaining symbols, along with their frequencies. The low
- * NUM_SYMBOL_BITS bits of each array entry will contain the symbol
- * value, and the remaining bits will contain the frequency.
+ * Sort the symbols primarily by frequency and secondarily by symbol value.
+ * Discard symbols with zero frequency and fill in an array with the remaining
+ * symbols, along with their frequencies. The low NUM_SYMBOL_BITS bits of each
+ * array entry will contain the symbol value, and the remaining bits will
+ * contain the frequency.
*
* @num_syms
- * Number of symbols in the alphabet.
- * Can't be greater than (1 << NUM_SYMBOL_BITS).
+ * Number of symbols in the alphabet, at most 1 << NUM_SYMBOL_BITS.
*
* @freqs[num_syms]
- * The frequency of each symbol.
+ * Frequency of each symbol, summing to at most (1 << NUM_FREQ_BITS) - 1.
*
* @lens[num_syms]
- * An array that eventually will hold the length of each codeword.
- * This function only fills in the codeword lengths for symbols that
- * have zero frequency, which are not well defined per se but will
- * be set to 0.
+ * An array that eventually will hold the length of each codeword. This
+ * function only fills in the codeword lengths for symbols that have zero
+ * frequency, which are not well defined per se but will be set to 0.
*
* @symout[num_syms]
* The output array, described above.
*
- * Returns the number of entries in 'symout' that were filled. This is
- * the number of symbols that have nonzero frequency.
+ * Returns the number of entries in 'symout' that were filled. This is the
+ * number of symbols that have nonzero frequency.
*/
static unsigned
-sort_symbols(unsigned num_syms, const u32 freqs[restrict],
- u8 lens[restrict], u32 symout[restrict])
+sort_symbols(unsigned num_syms, const u32 freqs[], u8 lens[], u32 symout[])
{
+ unsigned sym;
+ unsigned i;
unsigned num_used_syms;
unsigned num_counters;
+ unsigned counters[GET_NUM_COUNTERS(MAX_NUM_SYMS)];
- /* We rely on heapsort, but with an added optimization. Since
- * it's common for most symbol frequencies to be low, we first do
- * a count sort using a limited number of counters. High
- * frequencies will be counted in the last counter, and only they
- * will be sorted with heapsort.
+ /*
+ * We use heapsort, but with an added optimization. Since often most
+ * symbol frequencies are low, we first do a count sort using a limited
+ * number of counters. High frequencies are counted in the last
+ * counter, and only they will be sorted with heapsort.
*
* Note: with more symbols, it is generally beneficial to have more
- * counters. About 1 counter per 4 symbols seems fast.
- *
- * Note: I also tested radix sort, but even for large symbol
- * counts (> 255) and frequencies bounded at 16 bits (enabling
- * radix sort by just two base-256 digits), it didn't seem any
- * faster than the method implemented here.
- *
- * Note: I tested the optimized quicksort implementation from
- * glibc (with indirection overhead removed), but it was only
- * marginally faster than the simple heapsort implemented here.
- *
- * Tests were done with building the codes for LZX. Results may
- * vary for different compression algorithms...! */
-
- num_counters = ALIGN(DIV_ROUND_UP(num_syms, 4), 4);
+ * counters. About 1 counter per symbol seems fastest.
+ */
- unsigned counters[num_counters];
+ num_counters = GET_NUM_COUNTERS(num_syms);
- memset(counters, 0, sizeof(counters));
+ memset(counters, 0, num_counters * sizeof(counters[0]));
- /* Count the frequencies. */
- for (unsigned sym = 0; sym < num_syms; sym++)
- counters[min(freqs[sym], num_counters - 1)]++;
+ /* Count the frequencies. */
+ for (sym = 0; sym < num_syms; sym++)
+ counters[MIN(freqs[sym], num_counters - 1)]++;
- /* Make the counters cumulative, ignoring the zero-th, which
- * counted symbols with zero frequency. As a side effect, this
- * calculates the number of symbols with nonzero frequency. */
+ /*
+ * Make the counters cumulative, ignoring the zero-th, which counted
+ * symbols with zero frequency. As a side effect, this calculates the
+ * number of symbols with nonzero frequency.
+ */
num_used_syms = 0;
- for (unsigned i = 1; i < num_counters; i++) {
+ for (i = 1; i < num_counters; i++) {
unsigned count = counters[i];
+
counters[i] = num_used_syms;
num_used_syms += count;
}
- /* Sort nonzero-frequency symbols using the counters. At the
- * same time, set the codeword lengths of zero-frequency symbols
- * to 0. */
- for (unsigned sym = 0; sym < num_syms; sym++) {
+ /*
+ * Sort nonzero-frequency symbols using the counters. At the same time,
+ * set the codeword lengths of zero-frequency symbols to 0.
+ */
+ for (sym = 0; sym < num_syms; sym++) {
u32 freq = freqs[sym];
+
if (freq != 0) {
- symout[counters[min(freq, num_counters - 1)]++] =
+ symout[counters[MIN(freq, num_counters - 1)]++] =
sym | (freq << NUM_SYMBOL_BITS);
} else {
lens[sym] = 0;
}
}
- /* Sort the symbols counted in the last counter. */
- heapsort(symout + counters[num_counters - 2],
- counters[num_counters - 1] - counters[num_counters - 2]);
+ /* Sort the symbols counted in the last counter. */
+ heap_sort(symout + counters[num_counters - 2],
+ counters[num_counters - 1] - counters[num_counters - 2]);
return num_used_syms;
}
/*
- * Build the Huffman tree.
+ * Build a Huffman tree.
*
* This is an optimized implementation that
* (a) takes advantage of the frequencies being already sorted;
- * (b) only generates non-leaf nodes, since the non-leaf nodes of a
- * Huffman tree are sufficient to generate a canonical code;
+ * (b) only generates non-leaf nodes, since the non-leaf nodes of a Huffman
+ * tree are sufficient to generate a canonical code;
* (c) Only stores parent pointers, not child pointers;
- * (d) Produces the nodes in the same memory used for input
- * frequency information.
- *
- * Array 'A', which contains 'sym_count' entries, is used for both input
- * and output. For this function, 'sym_count' must be at least 2.
- *
- * For input, the array must contain the frequencies of the symbols,
- * sorted in increasing order. Specifically, each entry must contain a
- * frequency left shifted by NUM_SYMBOL_BITS bits. Any data in the low
- * NUM_SYMBOL_BITS bits of the entries will be ignored by this function.
- * Although these bits will, in fact, contain the symbols that correspond
- * to the frequencies, this function is concerned with frequencies only
- * and keeps the symbols as-is.
- *
- * For output, this function will produce the non-leaf nodes of the
- * Huffman tree. These nodes will be stored in the first (sym_count - 1)
- * entries of the array. Entry A[sym_count - 2] will represent the root
- * node. Each other node will contain the zero-based index of its parent
- * node in 'A', left shifted by NUM_SYMBOL_BITS bits. The low
- * NUM_SYMBOL_BITS bits of each entry in A will be kept as-is. Again,
- * note that although these low bits will, in fact, contain a symbol
- * value, this symbol will have *no relationship* with the Huffman tree
- * node that happens to occupy the same slot. This is because this
+ * (d) Produces the nodes in the same memory used for input frequency
+ * information.
+ *
+ * Array 'A', which contains 'sym_count' entries, is used for both input and
+ * output. For this function, 'sym_count' must be at least 2.
+ *
+ * For input, the array must contain the frequencies of the symbols, sorted in
+ * increasing order. Specifically, each entry must contain a frequency left
+ * shifted by NUM_SYMBOL_BITS bits. Any data in the low NUM_SYMBOL_BITS bits of
+ * the entries will be ignored by this function. Although these bits will, in
+ * fact, contain the symbols that correspond to the frequencies, this function
+ * is concerned with frequencies only and keeps the symbols as-is.
+ *
+ * For output, this function will produce the non-leaf nodes of the Huffman
+ * tree. These nodes will be stored in the first (sym_count - 1) entries of the
+ * array. Entry A[sym_count - 2] will represent the root node. Each other node
+ * will contain the zero-based index of its parent node in 'A', left shifted by
+ * NUM_SYMBOL_BITS bits. The low NUM_SYMBOL_BITS bits of each entry in A will
+ * be kept as-is. Again, note that although these low bits will, in fact,
+ * contain a symbol value, this symbol will have *no relationship* with the
+ * Huffman tree node that happens to occupy the same slot. This is because this
* implementation only generates the non-leaf nodes of the tree.
*/
static void
build_tree(u32 A[], unsigned sym_count)
{
- /* Index, in 'A', of next lowest frequency symbol that has not
- * yet been processed. */
+ const unsigned last_idx = sym_count - 1;
+
+ /* Index of the next lowest frequency leaf that still needs a parent */
unsigned i = 0;
- /* Index, in 'A', of next lowest frequency parentless non-leaf
- * node; or, if equal to 'e', then no such node exists yet. */
+ /*
+ * Index of the next lowest frequency non-leaf that still needs a
+ * parent, or 'e' if there is currently no such node
+ */
unsigned b = 0;
- /* Index, in 'A', of next node to allocate as a non-leaf. */
+ /* Index of the next spot for a non-leaf (will overwrite a leaf) */
unsigned e = 0;
do {
- unsigned m, n;
- u32 freq_shifted;
+ u32 new_freq;
- /* Choose the two next lowest frequency entries. */
-
- if (i != sym_count &&
- (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS)))
- m = i++;
- else
- m = b++;
-
- if (i != sym_count &&
- (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS)))
- n = i++;
- else
- n = b++;
-
- /* Allocate a non-leaf node and link the entries to it.
+ /*
+ * Select the next two lowest frequency nodes among the leaves
+ * A[i] and non-leaves A[b], and create a new node A[e] to be
+ * their parent. Set the new node's frequency to the sum of the
+ * frequencies of its two children.
*
- * If we link an entry that we're visiting for the first
- * time (via index 'i'), then we're actually linking a
- * leaf node and it will have no effect, since the leaf
- * will be overwritten with a non-leaf when index 'e'
- * catches up to it. But it's not any slower to
- * unconditionally set the parent index.
- *
- * We also compute the frequency of the non-leaf node as
- * the sum of its two children's frequencies. */
-
- freq_shifted = (A[m] & ~SYMBOL_MASK) + (A[n] & ~SYMBOL_MASK);
-
- A[m] = (A[m] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS);
- A[n] = (A[n] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS);
- A[e] = (A[e] & SYMBOL_MASK) | freq_shifted;
- e++;
- } while (sym_count - e > 1);
- /* When just one entry remains, it is a "leaf" that was
- * linked to some other node. We ignore it, since the
- * rest of the array contains the non-leaves which we
- * need. (Note that we're assuming the cases with 0 or 1
- * symbols were handled separately.) */
+ * Usually the next two lowest frequency nodes are of the same
+ * type (leaf or non-leaf), so check those cases first.
+ */
+ if (i + 1 <= last_idx &&
+ (b == e || (A[i + 1] & FREQ_MASK) <= (A[b] & FREQ_MASK))) {
+ /* Two leaves */
+ new_freq = (A[i] & FREQ_MASK) + (A[i + 1] & FREQ_MASK);
+ i += 2;
+ } else if (b + 2 <= e &&
+ (i > last_idx ||
+ (A[b + 1] & FREQ_MASK) < (A[i] & FREQ_MASK))) {
+ /* Two non-leaves */
+ new_freq = (A[b] & FREQ_MASK) + (A[b + 1] & FREQ_MASK);
+ A[b] = (e << NUM_SYMBOL_BITS) | (A[b] & SYMBOL_MASK);
+ A[b + 1] = (e << NUM_SYMBOL_BITS) |
+ (A[b + 1] & SYMBOL_MASK);
+ b += 2;
+ } else {
+ /* One leaf and one non-leaf */
+ new_freq = (A[i] & FREQ_MASK) + (A[b] & FREQ_MASK);
+ A[b] = (e << NUM_SYMBOL_BITS) | (A[b] & SYMBOL_MASK);
+ i++;
+ b++;
+ }
+ A[e] = new_freq | (A[e] & SYMBOL_MASK);
+ /*
+ * A binary tree with 'n' leaves has 'n - 1' non-leaves, so the
+ * tree is complete once we've created 'n - 1' non-leaves.
+ */
+ } while (++e < last_idx);
}
/*
- * Given the stripped-down Huffman tree constructed by build_tree(),
- * determine the number of codewords that should be assigned each
- * possible length, taking into account the length-limited constraint.
+ * Given the stripped-down Huffman tree constructed by build_tree(), determine
+ * the number of codewords that should be assigned each possible length, taking
+ * into account the length-limited constraint.
*
* @A
- * The array produced by build_tree(), containing parent index
- * information for the non-leaf nodes of the Huffman tree. Each
- * entry in this array is a node; a node's parent always has a
- * greater index than that node itself. This function will
- * overwrite the parent index information in this array, so
- * essentially it will destroy the tree. However, the data in the
- * low NUM_SYMBOL_BITS of each entry will be preserved.
+ * The array produced by build_tree(), containing parent index information
+ * for the non-leaf nodes of the Huffman tree. Each entry in this array is
+ * a node; a node's parent always has a greater index than that node
+ * itself. This function will overwrite the parent index information in
+ * this array, so essentially it will destroy the tree. However, the data
+ * in the low NUM_SYMBOL_BITS of each entry will be preserved.
*
* @root_idx
- * The 0-based index of the root node in 'A', and consequently one
- * less than the number of tree node entries in 'A'. (Or, really 2
- * less than the actual length of 'A'.)
+ * The 0-based index of the root node in 'A', and consequently one less
+ * than the number of tree node entries in 'A'. (Or, really 2 less than
+ * the actual length of 'A'.)
*
* @len_counts
* An array of length ('max_codeword_len' + 1) in which the number of
- * codewords having each length <= max_codeword_len will be
- * returned.
+ * codewords having each length <= max_codeword_len will be returned.
*
* @max_codeword_len
* The maximum permissible codeword length.
*/
static void
-compute_length_counts(u32 A[restrict], unsigned root_idx,
- unsigned len_counts[restrict], unsigned max_codeword_len)
+compute_length_counts(u32 A[], unsigned root_idx, unsigned len_counts[],
+ unsigned max_codeword_len)
{
- /* The key observations are:
+ unsigned len;
+ int node;
+
+ /*
+ * The key observations are:
*
- * (1) We can traverse the non-leaf nodes of the tree, always
- * visiting a parent before its children, by simply iterating
- * through the array in reverse order. Consequently, we can
- * compute the depth of each node in one pass, overwriting the
- * parent indices with depths.
+ * (1) We can traverse the non-leaf nodes of the tree, always visiting a
+ * parent before its children, by simply iterating through the array
+ * in reverse order. Consequently, we can compute the depth of each
+ * node in one pass, overwriting the parent indices with depths.
*
- * (2) We can initially assume that in the real Huffman tree,
- * both children of the root are leaves. This corresponds to two
- * codewords of length 1. Then, whenever we visit a (non-leaf)
- * node during the traversal, we modify this assumption to
- * account for the current node *not* being a leaf, but rather
- * its two children being leaves. This causes the loss of one
- * codeword for the current depth and the addition of two
- * codewords for the current depth plus one.
+ * (2) We can initially assume that in the real Huffman tree, both
+ * children of the root are leaves. This corresponds to two
+ * codewords of length 1. Then, whenever we visit a (non-leaf) node
+ * during the traversal, we modify this assumption to account for
+ * the current node *not* being a leaf, but rather its two children
+ * being leaves. This causes the loss of one codeword for the
+ * current depth and the addition of two codewords for the current
+ * depth plus one.
*
- * (3) We can handle the length-limited constraint fairly easily
- * by simply using the largest length available when a depth
- * exceeds max_codeword_len.
+ * (3) We can handle the length-limited constraint fairly easily by
+ * simply using the largest length available when a depth exceeds
+ * max_codeword_len.
*/
- for (unsigned len = 0; len <= max_codeword_len; len++)
+ for (len = 0; len <= max_codeword_len; len++)
len_counts[len] = 0;
len_counts[1] = 2;
- /* Set the root node's depth to 0. */
+ /* Set the root node's depth to 0. */
A[root_idx] &= SYMBOL_MASK;
- for (int node = root_idx - 1; node >= 0; node--) {
+ for (node = root_idx - 1; node >= 0; node--) {
- /* Calculate the depth of this node. */
+ /* Calculate the depth of this node. */
unsigned parent = A[node] >> NUM_SYMBOL_BITS;
unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS;
unsigned depth = parent_depth + 1;
unsigned len = depth;
- /* Set the depth of this node so that it is available
- * when its children (if any) are processed. */
-
+ /*
+ * Set the depth of this node so that it is available when its
+ * children (if any) are processed.
+ */
A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS);
- /* If needed, decrease the length to meet the
- * length-limited constraint. This is not the optimal
- * method for generating length-limited Huffman codes!
- * But it should be good enough. */
+ /*
+ * If needed, decrease the length to meet the length-limited
+ * constraint. This is not the optimal method for generating
+ * length-limited Huffman codes! But it should be good enough.
+ */
if (len >= max_codeword_len) {
len = max_codeword_len;
do {
} while (len_counts[len] == 0);
}
- /* Account for the fact that we have a non-leaf node at
- * the current depth. */
+ /*
+ * Account for the fact that we have a non-leaf node at the
+ * current depth.
+ */
len_counts[len]--;
len_counts[len + 1] += 2;
}
* frequency. This is the length of the 'A' and 'len' arrays.
*/
static void
-gen_codewords(u32 A[restrict], u8 lens[restrict],
- const unsigned len_counts[restrict],
+gen_codewords(u32 A[], u8 lens[], const unsigned len_counts[],
unsigned max_codeword_len, unsigned num_syms)
{
- u32 next_codewords[max_codeword_len + 1];
-
- /* Given the number of codewords that will have each length,
- * assign codeword lengths to symbols. We do this by assigning
- * the lengths in decreasing order to the symbols sorted
- * primarily by increasing frequency and secondarily by
- * increasing symbol value. */
- for (unsigned i = 0, len = max_codeword_len; len >= 1; len--) {
+ u32 next_codewords[MAX_CODEWORD_LEN + 1];
+ unsigned i;
+ unsigned len;
+ unsigned sym;
+
+ /*
+ * Given the number of codewords that will have each length, assign
+ * codeword lengths to symbols. We do this by assigning the lengths in
+ * decreasing order to the symbols sorted primarily by increasing
+ * frequency and secondarily by increasing symbol value.
+ */
+ for (i = 0, len = max_codeword_len; len >= 1; len--) {
unsigned count = len_counts[len];
+
while (count--)
lens[A[i++] & SYMBOL_MASK] = len;
}
- /* Generate the codewords themselves. We initialize the
+ /*
+ * Generate the codewords themselves. We initialize the
* 'next_codewords' array to provide the lexicographically first
- * codeword of each length, then assign codewords in symbol
- * order. This produces a canonical code. */
+ * codeword of each length, then assign codewords in symbol order. This
+ * produces a canonical code.
+ */
next_codewords[0] = 0;
next_codewords[1] = 0;
- for (unsigned len = 2; len <= max_codeword_len; len++)
+ for (len = 2; len <= max_codeword_len; len++)
next_codewords[len] =
(next_codewords[len - 1] + len_counts[len - 1]) << 1;
- for (unsigned sym = 0; sym < num_syms; sym++)
+ for (sym = 0; sym < num_syms; sym++)
A[sym] = next_codewords[lens[sym]]++;
}
* length-limited canonical Huffman code.
*
* @num_syms
- * The number of symbols in the alphabet. The symbols are the
- * integers in the range [0, num_syms - 1]. This parameter must be
- * at least 2 and can't be greater than (1 << NUM_SYMBOL_BITS).
+ * The number of symbols in the alphabet. The symbols are the integers in
+ * the range [0, num_syms - 1]. This parameter must be at least 2 and
+ * must not exceed (1 << NUM_SYMBOL_BITS).
*
* @max_codeword_len
* The maximum permissible codeword length.
*
* @freqs
- * An array of @num_syms entries, each of which specifies the
- * frequency of the corresponding symbol. It is valid for some,
- * none, or all of the frequencies to be 0.
+ * An array of length @num_syms that gives the frequency of each symbol.
+ * It is valid for some, none, or all of the frequencies to be 0. The sum
+ * of frequencies must not exceed (1 << NUM_FREQ_BITS) - 1.
*
* @lens
- * An array of @num_syms entries in which this function will return
- * the length, in bits, of the codeword assigned to each symbol.
- * Symbols with 0 frequency will not have codewords per se, but
- * their entries in this array will be set to 0. No lengths greater
- * than @max_codeword_len will be assigned.
+ * An array of @num_syms entries in which this function will return the
+ * length, in bits, of the codeword assigned to each symbol. Symbols with
+ * 0 frequency will not have codewords per se, but their entries in this
+ * array will be set to 0. No lengths greater than @max_codeword_len will
+ * be assigned.
*
* @codewords
- * An array of @num_syms entries in which this function will return
- * the codeword for each symbol, right-justified and padded on the
- * left with zeroes. Codewords for symbols with 0 frequency will be
- * undefined.
+ * An array of @num_syms entries in which this function will return the
+ * codeword for each symbol, right-justified and padded on the left with
+ * zeroes. Codewords for symbols with 0 frequency will be undefined.
*
* ---------------------------------------------------------------------
*
* This function builds a length-limited canonical Huffman code.
*
* A length-limited Huffman code contains no codewords longer than some
- * specified length, and has exactly (with some algorithms) or
- * approximately (with the algorithm used here) the minimum weighted path
- * length from the root, given this constraint.
- *
- * A canonical Huffman code satisfies the properties that a longer
- * codeword never lexicographically precedes a shorter codeword, and the
- * lexicographic ordering of codewords of the same length is the same as
- * the lexicographic ordering of the corresponding symbols. A canonical
- * Huffman code, or more generally a canonical prefix code, can be
- * reconstructed from only a list containing the codeword length of each
- * symbol.
- *
- * The classic algorithm to generate a Huffman code creates a node for
- * each symbol, then inserts these nodes into a min-heap keyed by symbol
- * frequency. Then, repeatedly, the two lowest-frequency nodes are
- * removed from the min-heap and added as the children of a new node
- * having frequency equal to the sum of its two children, which is then
- * inserted into the min-heap. When only a single node remains in the
- * min-heap, it is the root of the Huffman tree. The codeword for each
- * symbol is determined by the path needed to reach the corresponding
- * node from the root. Descending to the left child appends a 0 bit,
- * whereas descending to the right child appends a 1 bit.
- *
- * The classic algorithm is relatively easy to understand, but it is
- * subject to a number of inefficiencies. In practice, it is fastest to
- * first sort the symbols by frequency. (This itself can be subject to
- * an optimization based on the fact that most frequencies tend to be
- * low.) At the same time, we sort secondarily by symbol value, which
- * aids the process of generating a canonical code. Then, during tree
- * construction, no heap is necessary because both the leaf nodes and the
- * unparented non-leaf nodes can be easily maintained in sorted order.
- * Consequently, there can never be more than two possibilities for the
- * next-lowest-frequency node.
- *
- * In addition, because we're generating a canonical code, we actually
- * don't need the leaf nodes of the tree at all, only the non-leaf nodes.
- * This is because for canonical code generation we don't need to know
- * where the symbols are in the tree. Rather, we only need to know how
- * many leaf nodes have each depth (codeword length). And this
- * information can, in fact, be quickly generated from the tree of
- * non-leaves only.
- *
- * Furthermore, we can build this stripped-down Huffman tree directly in
- * the array in which the codewords are to be generated, provided that
- * these array slots are large enough to hold a symbol and frequency
- * value.
- *
- * Still furthermore, we don't even need to maintain explicit child
- * pointers. We only need the parent pointers, and even those can be
- * overwritten in-place with depth information as part of the process of
- * extracting codeword lengths from the tree. So in summary, we do NOT
- * need a big structure like:
+ * specified length, and has exactly (with some algorithms) or approximately
+ * (with the algorithm used here) the minimum weighted path length from the
+ * root, given this constraint.
+ *
+ * A canonical Huffman code satisfies the properties that a longer codeword
+ * never lexicographically precedes a shorter codeword, and the lexicographic
+ * ordering of codewords of the same length is the same as the lexicographic
+ * ordering of the corresponding symbols. A canonical Huffman code, or more
+ * generally a canonical prefix code, can be reconstructed from only a list
+ * containing the codeword length of each symbol.
+ *
+ * The classic algorithm to generate a Huffman code creates a node for each
+ * symbol, then inserts these nodes into a min-heap keyed by symbol frequency.
+ * Then, repeatedly, the two lowest-frequency nodes are removed from the
+ * min-heap and added as the children of a new node having frequency equal to
+ * the sum of its two children, which is then inserted into the min-heap. When
+ * only a single node remains in the min-heap, it is the root of the Huffman
+ * tree. The codeword for each symbol is determined by the path needed to reach
+ * the corresponding node from the root. Descending to the left child appends a
+ * 0 bit, whereas descending to the right child appends a 1 bit.
+ *
+ * The classic algorithm is relatively easy to understand, but it is subject to
+ * a number of inefficiencies. In practice, it is fastest to first sort the
+ * symbols by frequency. (This itself can be subject to an optimization based
+ * on the fact that most frequencies tend to be low.) At the same time, we sort
+ * secondarily by symbol value, which aids the process of generating a canonical
+ * code. Then, during tree construction, no heap is necessary because both the
+ * leaf nodes and the unparented non-leaf nodes can be easily maintained in
+ * sorted order. Consequently, there can never be more than two possibilities
+ * for the next-lowest-frequency node.
+ *
+ * In addition, because we're generating a canonical code, we actually don't
+ * need the leaf nodes of the tree at all, only the non-leaf nodes. This is
+ * because for canonical code generation we don't need to know where the symbols
+ * are in the tree. Rather, we only need to know how many leaf nodes have each
+ * depth (codeword length). And this information can, in fact, be quickly
+ * generated from the tree of non-leaves only.
+ *
+ * Furthermore, we can build this stripped-down Huffman tree directly in the
+ * array in which the codewords are to be generated, provided that these array
+ * slots are large enough to hold a symbol and frequency value.
+ *
+ * Still furthermore, we don't even need to maintain explicit child pointers.
+ * We only need the parent pointers, and even those can be overwritten in-place
+ * with depth information as part of the process of extracting codeword lengths
+ * from the tree. So in summary, we do NOT need a big structure like:
*
* struct huffman_tree_node {
* unsigned int symbol;
* };
*
*
- * ... which often gets used in "naive" implementations of Huffman code
- * generation.
- *
- * Most of these optimizations are based on the implementation in 7-Zip
- * (source file: C/HuffEnc.c), which has been placed in the public domain
- * by Igor Pavlov. But I've rewritten the code with extensive comments,
- * as it took me a while to figure out what it was doing...!
- *
- * ---------------------------------------------------------------------
- *
- * NOTE: in general, the same frequencies can be used to generate
- * different length-limited canonical Huffman codes. One choice we have
- * is during tree construction, when we must decide whether to prefer a
- * leaf or non-leaf when there is a tie in frequency. Another choice we
- * have is how to deal with codewords that would exceed @max_codeword_len
- * bits in length. Both of these choices affect the resulting codeword
- * lengths, which otherwise can be mapped uniquely onto the resulting
- * canonical Huffman code.
- *
- * Normally, there is no problem with choosing one valid code over
- * another, provided that they produce similar compression ratios.
- * However, the LZMS compression format uses adaptive Huffman coding. It
- * requires that both the decompressor and compressor build a canonical
- * code equivalent to that which can be generated by using the classic
- * Huffman tree construction algorithm and always processing leaves
- * before non-leaves when there is a frequency tie. Therefore, we make
- * sure to do this. This method also has the advantage of sometimes
- * shortening the longest codeword that is generated.
- *
- * There also is the issue of how codewords longer than @max_codeword_len
- * are dealt with. Fortunately, for LZMS this is irrelevant because for
- * the LZMS alphabets no codeword can ever exceed LZMS_MAX_CODEWORD_LEN
- * (= 15). Since the LZMS algorithm regularly halves all frequencies,
- * the frequencies cannot become high enough for a length 16 codeword to
- * be generated. Specifically, I think that if ties are broken in favor
- * of non-leaves (as we do), the lowest total frequency that would give a
- * length-16 codeword would be the sum of the frequencies 1 1 1 3 4 7 11
- * 18 29 47 76 123 199 322 521 843 1364, which is 3570. And in LZMS we
- * can't get a frequency that high based on the alphabet sizes, rebuild
- * frequencies, and scaling factors. This worst-case scenario is based
- * on the following degenerate case (only the bottom of the tree shown):
+ * ... which often gets used in "naive" implementations of Huffman code
+ * generation.
+ *
+ * Many of these optimizations are based on the implementation in 7-Zip (source
+ * file: C/HuffEnc.c), which was placed in the public domain by Igor Pavlov.
+ *
+ * NOTE: in general, the same frequencies can be used to generate different
+ * length-limited canonical Huffman codes. One choice we have is during tree
+ * construction, when we must decide whether to prefer a leaf or non-leaf when
+ * there is a tie in frequency. Another choice we have is how to deal with
+ * codewords that would exceed @max_codeword_len bits in length. Both of these
+ * choices affect the resulting codeword lengths, which otherwise can be mapped
+ * uniquely onto the resulting canonical Huffman code.
+ *
+ * Normally, there is no problem with choosing one valid code over another,
+ * provided that they produce similar compression ratios. However, the LZMS
+ * compression format uses adaptive Huffman coding. It requires that both the
+ * decompressor and compressor build a canonical code equivalent to that which
+ * can be generated by using the classic Huffman tree construction algorithm and
+ * always processing leaves before non-leaves when there is a frequency tie.
+ * Therefore, we make sure to do this. This method also has the advantage of
+ * sometimes shortening the longest codeword that is generated.
+ *
+ * There also is the issue of how codewords longer than @max_codeword_len are
+ * dealt with. Fortunately, for LZMS this is irrelevant because for the LZMS
+ * alphabets no codeword can ever exceed LZMS_MAX_CODEWORD_LEN (= 15). Since
+ * the LZMS algorithm regularly halves all frequencies, the frequencies cannot
+ * become high enough for a length 16 codeword to be generated. Specifically, I
+ * think that if ties are broken in favor of non-leaves (as we do), the lowest
+ * total frequency that would give a length-16 codeword would be the sum of the
+ * frequencies 1 1 1 3 4 7 11 18 29 47 76 123 199 322 521 843 1364, which is
+ * 3570. And in LZMS we can't get a frequency that high based on the alphabet
+ * sizes, rebuild frequencies, and scaling factors. This worst-case scenario is
+ * based on the following degenerate case (only the bottom of the tree shown):
*
* ...
* 17
* / \
* 1 1
*
- * Excluding the first leaves (those with value 1), each leaf value must
- * be greater than the non-leaf up 1 and down 2 from it; otherwise that
- * leaf would have taken precedence over that non-leaf and been combined
- * with the leaf below, thereby decreasing the height compared to that
- * shown.
- *
- * Interesting fact: if we were to instead prioritize non-leaves over
- * leaves, then the worst case frequencies would be the Fibonacci
- * sequence, plus an extra frequency of 1. In this hypothetical
- * scenario, it would be slightly easier for longer codewords to be
- * generated.
+ * Excluding the first leaves (those with value 1), each leaf value must be
+ * greater than the non-leaf up 1 and down 2 from it; otherwise that leaf would
+ * have taken precedence over that non-leaf and been combined with the leaf
+ * below, thereby decreasing the height compared to that shown.
+ *
+ * Interesting fact: if we were to instead prioritize non-leaves over leaves,
+ * then the worst case frequencies would be the Fibonacci sequence, plus an
+ * extra frequency of 1. In this hypothetical scenario, it would be slightly
+ * easier for longer codewords to be generated.
*/
void
make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len,
- const u32 freqs[restrict],
- u8 lens[restrict], u32 codewords[restrict])
+ const u32 freqs[], u8 lens[], u32 codewords[])
{
u32 *A = codewords;
unsigned num_used_syms;
- /* We begin by sorting the symbols primarily by frequency and
- * secondarily by symbol value. As an optimization, the array
- * used for this purpose ('A') shares storage with the space in
- * which we will eventually return the codewords. */
+ wimlib_assert(num_syms <= MAX_NUM_SYMS);
+ STATIC_ASSERT(MAX_NUM_SYMS <= 1 << NUM_SYMBOL_BITS);
+ wimlib_assert(max_codeword_len <= MAX_CODEWORD_LEN);
+ /*
+ * We begin by sorting the symbols primarily by frequency and
+ * secondarily by symbol value. As an optimization, the array used for
+ * this purpose ('A') shares storage with the space in which we will
+ * eventually return the codewords.
+ */
num_used_syms = sort_symbols(num_syms, freqs, lens, A);
- /* 'num_used_syms' is the number of symbols with nonzero
- * frequency. This may be less than @num_syms. 'num_used_syms'
- * is also the number of entries in 'A' that are valid. Each
- * entry consists of a distinct symbol and a nonzero frequency
- * packed into a 32-bit integer. */
+ /*
+ * 'num_used_syms' is the number of symbols with nonzero frequency.
+ * This may be less than @num_syms. 'num_used_syms' is also the number
+ * of entries in 'A' that are valid. Each entry consists of a distinct
+ * symbol and a nonzero frequency packed into a 32-bit integer.
+ */
- /* Handle special cases where only 0 or 1 symbols were used (had
- * nonzero frequency). */
+ /*
+ * Handle special cases where only 0 or 1 symbols were used (had nonzero
+ * frequency).
+ */
if (unlikely(num_used_syms == 0)) {
- /* Code is empty. sort_symbols() already set all lengths
- * to 0, so there is nothing more to do. */
+ /*
+ * Code is empty. sort_symbols() already set all lengths to 0,
+ * so there is nothing more to do.
+ */
return;
}
if (unlikely(num_used_syms == 1)) {
- /* Only one symbol was used, so we only need one
- * codeword. But two codewords are needed to form the
- * smallest complete Huffman code, which uses codewords 0
- * and 1. Therefore, we choose another symbol to which
- * to assign a codeword. We use 0 (if the used symbol is
- * not 0) or 1 (if the used symbol is 0). In either
- * case, the lesser-valued symbol must be assigned
- * codeword 0 so that the resulting code is canonical. */
+ /*
+ * Only one symbol was used, so we only need one codeword. But
+ * two codewords are needed to form the smallest complete
+ * Huffman code, which uses codewords 0 and 1. Therefore, we
+ * choose another symbol to which to assign a codeword. We use
+ * 0 (if the used symbol is not 0) or 1 (if the used symbol is
+ * 0). In either case, the lesser-valued symbol must be
+ * assigned codeword 0 so that the resulting code is canonical.
+ */
unsigned sym = A[0] & SYMBOL_MASK;
unsigned nonzero_idx = sym ? sym : 1;
return;
}
- /* Build a stripped-down version of the Huffman tree, sharing the
- * array 'A' with the symbol values. Then extract length counts
- * from the tree and use them to generate the final codewords. */
+ /*
+ * Build a stripped-down version of the Huffman tree, sharing the array
+ * 'A' with the symbol values. Then extract length counts from the tree
+ * and use them to generate the final codewords.
+ */
build_tree(A, num_used_syms);
{
- unsigned len_counts[max_codeword_len + 1];
+ unsigned len_counts[MAX_CODEWORD_LEN + 1];
compute_length_counts(A, num_used_syms - 2,
len_counts, max_codeword_len);