X-Git-Url: https://wimlib.net/git/?p=wimlib;a=blobdiff_plain;f=src%2Fcompress_common.c;h=270b8cfe3b4ffd410a2cc02e0247e3327846f4a0;hp=9659d07b00d930f22badaaac6f5c9897c2f926b0;hb=fe548d263d477a745dfa5057f540cc5c35ecce89;hpb=883833a4b3dabec325edf1ca938000f91d587c00 diff --git a/src/compress_common.c b/src/compress_common.c index 9659d07b..270b8cfe 100644 --- a/src/compress_common.c +++ b/src/compress_common.c @@ -5,7 +5,7 @@ */ /* - * Copyright (C) 2012, 2013 Eric Biggers + * Copyright (C) 2012, 2013, 2014 Eric Biggers * * This file is part of wimlib, a library for working with WIM files. * @@ -33,7 +33,6 @@ #include "wimlib/compress_common.h" #include "wimlib/util.h" -#include #include /* Writes @num_bits bits, given by the @num_bits least significant bits of @@ -126,346 +125,627 @@ init_output_bitstream(struct output_bitstream *ostream, ostream->overrun = false; } -typedef struct { - input_idx_t freq; - u16 sym; - union { - u16 path_len; - u16 height; - }; -} HuffmanNode; - -typedef struct HuffmanIntermediateNode { - HuffmanNode node_base; - HuffmanNode *left_child; - HuffmanNode *right_child; -} HuffmanIntermediateNode; - - -/* Comparator function for HuffmanNodes. Sorts primarily by symbol - * frequency and secondarily by symbol value. */ -static int -cmp_nodes_by_freq(const void *_leaf1, const void *_leaf2) +/* Given the binary tree node A[subtree_idx] whose children already + * satisfy the maxheap property, swap the node with its greater child + * until it is greater than both its children, so that the maxheap + * property is satisfied in the subtree rooted at A[subtree_idx]. */ +static void +heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx) { - const HuffmanNode *leaf1 = _leaf1; - const HuffmanNode *leaf2 = _leaf2; - - if (leaf1->freq > leaf2->freq) - return 1; - else if (leaf1->freq < leaf2->freq) - return -1; - else - return (int)leaf1->sym - (int)leaf2->sym; + unsigned parent_idx; + unsigned child_idx; + u32 v; + + v = A[subtree_idx]; + parent_idx = subtree_idx; + while ((child_idx = parent_idx * 2) <= length) { + if (child_idx < length && A[child_idx + 1] > A[child_idx]) + child_idx++; + if (v >= A[child_idx]) + break; + A[parent_idx] = A[child_idx]; + parent_idx = child_idx; + } + A[parent_idx] = v; } -/* Comparator function for HuffmanNodes. Sorts primarily by code length and - * secondarily by symbol value. */ -static int -cmp_nodes_by_code_len(const void *_leaf1, const void *_leaf2) +/* Rearrange the array 'A' so that it satisfies the maxheap property. + * 'A' uses 1-based indices, so the children of A[i] are A[i*2] and A[i*2 + 1]. + */ +static void +heapify_array(u32 A[], unsigned length) { - const HuffmanNode *leaf1 = _leaf1; - const HuffmanNode *leaf2 = _leaf2; - - int code_len_diff = (int)leaf1->path_len - (int)leaf2->path_len; - - if (code_len_diff == 0) - return (int)leaf1->sym - (int)leaf2->sym; - else - return code_len_diff; + for (unsigned subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--) + heapify_subtree(A, length, subtree_idx); } -#define INVALID_SYMBOL 0xffff - -/* Recursive function to calculate the depth of the leaves in a Huffman tree. - * */ +/* Sort the array 'A', which contains 'length' unsigned 32-bit integers. */ static void -huffman_tree_compute_path_lengths(HuffmanNode *base_node, u16 cur_len) +heapsort(u32 A[], unsigned length) { - if (base_node->sym == INVALID_SYMBOL) { - /* Intermediate node. */ - HuffmanIntermediateNode *node = (HuffmanIntermediateNode*)base_node; - huffman_tree_compute_path_lengths(node->left_child, cur_len + 1); - huffman_tree_compute_path_lengths(node->right_child, cur_len + 1); - } else { - /* Leaf node. */ - base_node->path_len = cur_len; + A--; /* Use 1-based indices */ + + heapify_array(A, length); + + while (length >= 2) { + swap(A[1], A[length]); + length--; + heapify_subtree(A, length, 1); } } -/* make_canonical_huffman_code: - Creates a canonical Huffman code from an array - * of symbol frequencies. - * - * The algorithm used is similar to the well-known algorithm that builds a - * Huffman tree using a minheap. In that algorithm, the leaf nodes are - * initialized and inserted into the minheap with the frequency as the key. - * Repeatedly, the top two nodes (nodes with the lowest frequency) are taken out - * of the heap and made the children of a new node that has a frequency equal to - * the sum of the two frequencies of its children. This new node is inserted - * into the heap. When all the nodes have been removed from the heap, what - * remains is the Huffman tree. The Huffman code for a symbol is given by the - * path to it in the tree, where each left pointer is mapped to a 0 bit and each - * right pointer is mapped to a 1 bit. - * - * The algorithm used here uses an optimization that removes the need to - * actually use a heap. The leaf nodes are first sorted by frequency, as - * opposed to being made into a heap. Note that this sorting step takes O(n log - * n) time vs. O(n) time for heapifying the array, where n is the number of - * symbols. However, the heapless method is probably faster overall, due to the - * time saved later. In the heapless method, whenever an intermediate node is - * created, it is not inserted into the sorted array. Instead, the intermediate - * nodes are kept in a separate array, which is easily kept sorted because every - * time an intermediate node is initialized, it will have a frequency at least - * as high as that of the previous intermediate node that was initialized. So - * whenever we want the 2 nodes, leaf or intermediate, that have the lowest - * frequency, we check the low-frequency ends of both arrays, which is an O(1) - * operation. - * - * The function builds a canonical Huffman code, not just any Huffman code. A - * Huffman code is canonical if the codeword for each symbol numerically - * precedes the codeword for all other symbols of the same length that are - * numbered higher than the symbol, and additionally, all shorter codewords, - * 0-extended, numerically precede longer codewords. A canonical Huffman code - * is useful because it can be reconstructed by only knowing the path lengths in - * the tree. See the make_huffman_decode_table() function to see how to - * reconstruct a canonical Huffman code from only the lengths of the codes. - * - * @num_syms: The number of symbols in the alphabet. - * - * @max_codeword_len: The maximum allowed length of a codeword in the code. - * Note that if the code being created runs up against - * this restriction, the code ultimately created will be - * suboptimal, although there are some advantages for - * limiting the length of the codewords. - * - * @freq_tab: An array of length @num_syms that contains the frequencies - * of each symbol in the uncompressed data. - * - * @lens: An array of length @num_syms into which the lengths of the - * codewords for each symbol will be written. - * - * @codewords: An array of @num_syms short integers into which the - * codewords for each symbol will be written. The first - * lens[i] bits of codewords[i] will contain the codeword - * for symbol i. +#define NUM_SYMBOL_BITS 10 +#define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1) + +/* + * Sort the symbols primarily by frequency and secondarily by symbol + * value. Discard symbols with zero frequency and fill in an array with + * the remaining symbols, along with their frequencies. The low + * NUM_SYMBOL_BITS bits of each array entry will contain the symbol + * value, and the remaining bits will contain the frequency. + * + * @num_syms + * Number of symbols in the alphabet. + * Can't be greater than (1 << NUM_SYMBOL_BITS). + * + * @freqs[num_syms] + * The frequency of each symbol. + * + * @lens[num_syms] + * An array that eventually will hold the length of each codeword. + * This function only fills in the codeword lengths for symbols that + * have zero frequency, which are not well defined per se but will + * be set to 0. + * + * @symout[num_syms] + * The output array, described above. + * + * Returns the number of entries in 'symout' that were filled. This is + * the number of symbols that have nonzero frequency. */ -void -make_canonical_huffman_code(unsigned num_syms, - unsigned max_codeword_len, - const input_idx_t freq_tab[restrict], - u8 lens[restrict], - u16 codewords[restrict]) +static unsigned +sort_symbols(unsigned num_syms, const u32 freqs[restrict], + u8 lens[restrict], u32 symout[restrict]) { - /* We require at least 2 possible symbols in the alphabet to produce a - * valid Huffman decoding table. It is allowed that fewer than 2 symbols - * are actually used, though. */ - wimlib_assert(num_syms >= 2 && num_syms < INVALID_SYMBOL); - - /* Initialize the lengths and codewords to 0 */ - memset(lens, 0, num_syms * sizeof(lens[0])); - memset(codewords, 0, num_syms * sizeof(codewords[0])); - - /* Calculate how many symbols have non-zero frequency. These are the - * symbols that actually appeared in the input. */ - unsigned num_used_symbols = 0; - for (unsigned i = 0; i < num_syms; i++) - if (freq_tab[i] != 0) - num_used_symbols++; - - - /* It is impossible to make a code for num_used_symbols symbols if there - * aren't enough code bits to uniquely represent all of them. */ - wimlib_assert((1 << max_codeword_len) > num_used_symbols); - - /* Initialize the array of leaf nodes with the symbols and their - * frequencies. */ - HuffmanNode leaves[num_used_symbols]; - unsigned leaf_idx = 0; - for (unsigned i = 0; i < num_syms; i++) { - if (freq_tab[i] != 0) { - leaves[leaf_idx].freq = freq_tab[i]; - leaves[leaf_idx].sym = i; - leaves[leaf_idx].height = 0; - leaf_idx++; - } + unsigned num_used_syms; + unsigned num_counters; + + /* We rely on heapsort, but with an added optimization. Since + * it's common for most symbol frequencies to be low, we first do + * a count sort using a limited number of counters. High + * frequencies will be counted in the last counter, and only they + * will be sorted with heapsort. + * + * Note: with more symbols, it is generally beneficial to have more + * counters. About 1 counter per 4 symbols seems fast. + * + * Note: I also tested radix sort, but even for large symbol + * counts (> 255) and frequencies bounded at 16 bits (enabling + * radix sort by just two base-256 digits), it didn't seem any + * faster than the method implemented here. + * + * Note: I tested the optimized quicksort implementation from + * glibc (with indirection overhead removed), but it was only + * marginally faster than the simple heapsort implemented here. + * + * Tests were done with building the codes for LZX. Results may + * vary for different compression algorithms...! */ + + num_counters = (DIV_ROUND_UP(num_syms, 4) + 3) & ~3; + + unsigned counters[num_counters]; + + memset(counters, 0, sizeof(counters)); + + /* Count the frequencies. */ + for (unsigned sym = 0; sym < num_syms; sym++) + counters[min(freqs[sym], num_counters - 1)]++; + + /* Make the counters cumulative, ignoring the zero-th, which + * counted symbols with zero frequency. As a side effect, this + * calculates the number of symbols with nonzero frequency. */ + num_used_syms = 0; + for (unsigned i = 1; i < num_counters; i++) { + unsigned count = counters[i]; + counters[i] = num_used_syms; + num_used_syms += count; } - /* Deal with the special cases where num_used_symbols < 2. */ - if (num_used_symbols < 2) { - if (num_used_symbols == 0) { - /* If num_used_symbols is 0, there are no symbols in the - * input, so it must be empty. This should be an error, - * but the LZX format expects this case to succeed. All - * the codeword lengths are simply marked as 0 (which - * was already done.) */ + /* Sort nonzero-frequency symbols using the counters. At the + * same time, set the codeword lengths of zero-frequency symbols + * to 0. */ + for (unsigned sym = 0; sym < num_syms; sym++) { + u32 freq = freqs[sym]; + if (freq != 0) { + symout[counters[min(freq, num_counters - 1)]++] = + sym | (freq << NUM_SYMBOL_BITS); } else { - /* If only one symbol is present, the LZX format - * requires that the Huffman code include two codewords. - * One is not used. Note that this doesn't make the - * encoded data take up more room anyway, since binary - * data itself has 2 symbols. */ - - unsigned sym = leaves[0].sym; - - codewords[0] = 0; - lens[0] = 1; - if (sym == 0) { - /* dummy symbol is 1, real symbol is 0 */ - codewords[1] = 1; - lens[1] = 1; - } else { - /* dummy symbol is 0, real symbol is sym */ - codewords[sym] = 1; - lens[sym] = 1; - } + lens[sym] = 0; } - return; } - /* Otherwise, there are at least 2 symbols in the input, so we need to - * find a real Huffman code. */ + /* Sort the symbols counted in the last counter. */ + heapsort(symout + counters[num_counters - 2], + counters[num_counters - 1] - counters[num_counters - 2]); + + return num_used_syms; +} +/* + * Build the Huffman tree. + * + * This is an optimized implementation that + * (a) takes advantage of the frequencies being already sorted; + * (b) only generates non-leaf nodes, since the non-leaf nodes of a + * Huffman tree are sufficient to generate a canonical code; + * (c) Only stores parent pointers, not child pointers; + * (d) Produces the nodes in the same memory used for input + * frequency information. + * + * Array 'A', which contains 'sym_count' entries, is used for both input + * and output. For this function, 'sym_count' must be at least 2. + * + * For input, the array must contain the frequencies of the symbols, + * sorted in increasing order. Specifically, each entry must contain a + * frequency left shifted by NUM_SYMBOL_BITS bits. Any data in the low + * NUM_SYMBOL_BITS bits of the entries will be ignored by this function. + * Although these bits will, in fact, contain the symbols that correspond + * to the frequencies, this function is concerned with frequencies only + * and keeps the symbols as-is. + * + * For output, this function will produce the non-leaf nodes of the + * Huffman tree. These nodes will be stored in the first (sym_count - 1) + * entries of the array. Entry A[sym_count - 2] will represent the root + * node. Each other node will contain the zero-based index of its parent + * node in 'A', left shifted by NUM_SYMBOL_BITS bits. The low + * NUM_SYMBOL_BITS bits of each entry in A will be kept as-is. Again, + * note that although these low bits will, in fact, contain a symbol + * value, this symbol will have *no relationship* with the Huffman tree + * node that happens to occupy the same slot. This is because this + * implementation only generates the non-leaf nodes of the tree. + */ +static void +build_tree(u32 A[], unsigned sym_count) +{ + /* Index, in 'A', of next lowest frequency symbol that has not + * yet been processed. */ + unsigned i = 0; + + /* Index, in 'A', of next lowest frequency parentless non-leaf + * node; or, if equal to 'e', then no such node exists yet. */ + unsigned b = 0; + + /* Index, in 'A', of next node to allocate as a non-leaf. */ + unsigned e = 0; + + do { + unsigned m, n; + u32 freq_shifted; + + /* Choose the two next lowest frequency entries. */ + + if (i != sym_count && + (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS))) + m = i++; + else + m = b++; + + if (i != sym_count && + (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS))) + n = i++; + else + n = b++; + + /* Allocate a non-leaf node and link the entries to it. + * + * If we link an entry that we're visiting for the first + * time (via index 'i'), then we're actually linking a + * leaf node and it will have no effect, since the leaf + * will be overwritten with a non-leaf when index 'e' + * catches up to it. But it's not any slower to + * unconditionally set the parent index. + * + * We also compute the frequency of the non-leaf node as + * the sum of its two children's frequencies. */ + + freq_shifted = (A[m] & ~SYMBOL_MASK) + (A[n] & ~SYMBOL_MASK); + + A[m] = (A[m] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS); + A[n] = (A[n] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS); + A[e] = (A[e] & SYMBOL_MASK) | freq_shifted; + e++; + } while (sym_count - e > 1); + /* When just one entry remains, it is a "leaf" that was + * linked to some other node. We ignore it, since the + * rest of the array contains the non-leaves which we + * need. (Note that we're assuming the cases with 0 or 1 + * symbols were handled separately.) */ +} - /* Declare the array of intermediate nodes. An intermediate node is not - * associated with a symbol. Instead, it represents some binary code - * prefix that is shared between at least 2 codewords. There can be at - * most num_used_symbols - 1 intermediate nodes when creating a Huffman - * code. This is because if there were at least num_used_symbols nodes, - * the code would be suboptimal because there would be at least one - * unnecessary intermediate node. +/* + * Given the stripped-down Huffman tree constructed by build_tree(), + * determine the number of codewords that should be assigned each + * possible length, taking into account the length-limited constraint. + * + * @A + * The array produced by build_tree(), containing parent index + * information for the non-leaf nodes of the Huffman tree. Each + * entry in this array is a node; a node's parent always has a + * greater index than that node itself. This function will + * overwrite the parent index information in this array, so + * essentially it will destroy the tree. However, the data in the + * low NUM_SYMBOL_BITS of each entry will be preserved. + * + * @root_idx + * The 0-based index of the root node in 'A', and consequently one + * less than the number of tree node entries in 'A'. (Or, really 2 + * less than the actual length of 'A'.) + * + * @len_counts + * An array of length ('max_codeword_len' + 1) in which the number of + * codewords having each length <= max_codeword_len will be + * returned. + * + * @max_codeword_len + * The maximum permissible codeword length. + */ +static void +compute_length_counts(u32 A[restrict], unsigned root_idx, + unsigned len_counts[restrict], unsigned max_codeword_len) +{ + /* The key observations are: * - * The worst case (greatest number of intermediate nodes) would be if - * all the intermediate nodes were chained together. This results in - * num_used_symbols - 1 intermediate nodes. If num_used_symbols is at - * least 17, this configuration would not be allowed because the LZX - * format constrains codes to 16 bits or less each. However, it is - * still possible for there to be more than 16 intermediate nodes, as - * long as no leaf has a depth of more than 16. */ - HuffmanIntermediateNode inodes[num_used_symbols - 1]; - - - /* Pointer to the leaf node of lowest frequency that hasn't already been - * added as the child of some intermediate note. */ - HuffmanNode *cur_leaf; - - /* Pointer past the end of the array of leaves. */ - HuffmanNode *end_leaf = &leaves[num_used_symbols]; - - /* Pointer to the intermediate node of lowest frequency. */ - HuffmanIntermediateNode *cur_inode; - - /* Pointer to the next unallocated intermediate node. */ - HuffmanIntermediateNode *next_inode; - - /* Only jump back to here if the maximum length of the codewords allowed - * by the LZX format (16 bits) is exceeded. */ -try_building_tree_again: - - /* Sort the leaves from those that correspond to the least frequent - * symbol, to those that correspond to the most frequent symbol. If two - * leaves have the same frequency, they are sorted by symbol. */ - qsort(leaves, num_used_symbols, sizeof(leaves[0]), cmp_nodes_by_freq); - - cur_leaf = &leaves[0]; - cur_inode = &inodes[0]; - next_inode = &inodes[0]; - - /* The following loop takes the two lowest frequency nodes of those - * remaining and makes them the children of the next available - * intermediate node. It continues until all the leaf nodes and - * intermediate nodes have been used up, or the maximum allowed length - * for the codewords is exceeded. For the latter case, we must adjust - * the frequencies to be more equal and then execute this loop again. */ - while (1) { - - /* Lowest frequency node. */ - HuffmanNode *f1; - - /* Second lowest frequency node. */ - HuffmanNode *f2; - - /* Get the lowest and second lowest frequency nodes from the - * remaining leaves or from the intermediate nodes. */ - - if (cur_leaf != end_leaf && (cur_inode == next_inode || - cur_leaf->freq <= cur_inode->node_base.freq)) { - f1 = cur_leaf++; - } else if (cur_inode != next_inode) { - f1 = (HuffmanNode*)cur_inode++; - } + * (1) We can traverse the non-leaf nodes of the tree, always + * visiting a parent before its children, by simply iterating + * through the array in reverse order. Consequently, we can + * compute the depth of each node in one pass, overwriting the + * parent indices with depths. + * + * (2) We can initially assume that in the real Huffman tree, + * both children of the root are leaves. This corresponds to two + * codewords of length 1. Then, whenever we visit a (non-leaf) + * node during the traversal, we modify this assumption to + * account for the current node *not* being a leaf, but rather + * its two children being leaves. This causes the loss of one + * codeword for the current depth and the addition of two + * codewords for the current depth plus one. + * + * (3) We can handle the length-limited constraint fairly easily + * by simply using the largest length available when a depth + * exceeds max_codeword_len. + */ - if (cur_leaf != end_leaf && (cur_inode == next_inode || - cur_leaf->freq <= cur_inode->node_base.freq)) { - f2 = cur_leaf++; - } else if (cur_inode != next_inode) { - f2 = (HuffmanNode*)cur_inode++; - } else { - /* All nodes used up! */ - break; - } + for (unsigned len = 0; len <= max_codeword_len; len++) + len_counts[len] = 0; + len_counts[1] = 2; - /* next_inode becomes the parent of f1 and f2. */ - - next_inode->node_base.freq = f1->freq + f2->freq; - next_inode->node_base.sym = INVALID_SYMBOL; - next_inode->left_child = f1; - next_inode->right_child = f2; - - /* We need to keep track of the height so that we can detect if - * the length of a codeword has execeed max_codeword_len. The - * parent node has a height one higher than the maximum height - * of its children. */ - next_inode->node_base.height = max(f1->height, f2->height) + 1; - - /* Check to see if the code length of the leaf farthest away - * from next_inode has exceeded the maximum code length. */ - if (next_inode->node_base.height > max_codeword_len) { - /* The code lengths can be made more uniform by making - * the frequencies more uniform. Divide all the - * frequencies by 2, leaving 1 as the minimum frequency. - * If this keeps happening, the symbol frequencies will - * approach equality, which makes their Huffman - * codewords approach the length - * log_2(num_used_symbols). - * */ - for (unsigned i = 0; i < num_used_symbols; i++) - leaves[i].freq = (leaves[i].freq + 1) >> 1; - - goto try_building_tree_again; + /* Set the root node's depth to 0. */ + A[root_idx] &= SYMBOL_MASK; + + for (int node = root_idx - 1; node >= 0; node--) { + + /* Calculate the depth of this node. */ + + unsigned parent = A[node] >> NUM_SYMBOL_BITS; + unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS; + unsigned depth = parent_depth + 1; + unsigned len = depth; + + /* Set the depth of this node so that it is available + * when its children (if any) are processed. */ + + A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS); + + /* If needed, decrease the length to meet the + * length-limited constraint. This is not the optimal + * method for generating length-limited Huffman codes! + * But it should be good enough. */ + if (len >= max_codeword_len) { + len = max_codeword_len; + do { + len--; + } while (len_counts[len] == 0); } - next_inode++; + + /* Account for the fact that we have a non-leaf node at + * the current depth. */ + len_counts[len]--; + len_counts[len + 1] += 2; } +} - /* The Huffman tree is now complete, and its height is no more than - * max_codeword_len. */ +/* + * Generate the codewords for a canonical Huffman code. + * + * @A + * The output array for codewords. In addition, initially this + * array must contain the symbols, sorted primarily by frequency and + * secondarily by symbol value, in the low NUM_SYMBOL_BITS bits of + * each entry. + * + * @len + * Output array for codeword lengths. + * + * @len_counts + * An array that provides the number of codewords that will have + * each possible length <= max_codeword_len. + * + * @max_codeword_len + * Maximum length, in bits, of each codeword. + * + * @num_syms + * Number of symbols in the alphabet, including symbols with zero + * frequency. This is the length of the 'A' and 'len' arrays. + */ +static void +gen_codewords(u32 A[restrict], u8 lens[restrict], + const unsigned len_counts[restrict], + unsigned max_codeword_len, unsigned num_syms) +{ + u32 next_codewords[max_codeword_len + 1]; + + /* Given the number of codewords that will have each length, + * assign codeword lengths to symbols. We do this by assigning + * the lengths in decreasing order to the symbols sorted + * primarily by increasing frequency and secondarily by + * increasing symbol value. */ + for (unsigned i = 0, len = max_codeword_len; len >= 1; len--) { + unsigned count = len_counts[len]; + while (count--) + lens[A[i++] & SYMBOL_MASK] = len; + } - HuffmanIntermediateNode *root = next_inode - 1; - wimlib_assert(root->node_base.height <= max_codeword_len); + /* Generate the codewords themselves. We initialize the + * 'next_codewords' array to provide the lexicographically first + * codeword of each length, then assign codewords in symbol + * order. This produces a canonical code. */ + next_codewords[0] = 0; + next_codewords[1] = 0; + for (unsigned len = 2; len <= max_codeword_len; len++) + next_codewords[len] = + (next_codewords[len - 1] + len_counts[len - 1]) << 1; + + for (unsigned sym = 0; sym < num_syms; sym++) + A[sym] = next_codewords[lens[sym]]++; +} - /* Compute the path lengths for the leaf nodes. */ - huffman_tree_compute_path_lengths(&root->node_base, 0); +/* + * --------------------------------------------------------------------- + * make_canonical_huffman_code() + * --------------------------------------------------------------------- + * + * Given an alphabet and the frequency of each symbol in it, construct a + * length-limited canonical Huffman code. + * + * @num_syms + * The number of symbols in the alphabet. The symbols are the + * integers in the range [0, num_syms - 1]. This parameter must be + * at least 2 and can't be greater than (1 << NUM_SYMBOL_BITS). + * + * @max_codeword_len + * The maximum permissible codeword length. + * + * @freqs + * An array of @num_syms entries, each of which specifies the + * frequency of the corresponding symbol. It is valid for some, + * none, or all of the frequencies to be 0. + * + * @lens + * An array of @num_syms entries in which this function will return + * the length, in bits, of the codeword assigned to each symbol. + * Symbols with 0 frequency will not have codewords per se, but + * their entries in this array will be set to 0. No lengths greater + * than @max_codeword_len will be assigned. + * + * @codewords + * An array of @num_syms entries in which this function will return + * the codeword for each symbol, right-justified and padded on the + * left with zeroes. Codewords for symbols with 0 frequency will be + * undefined. + * + * --------------------------------------------------------------------- + * + * This function builds a length-limited canonical Huffman code. + * + * A length-limited Huffman code contains no codewords longer than some + * specified length, and has exactly (with some algorithms) or + * approximately (with the algorithm used here) the minimum weighted path + * length from the root, given this constraint. + * + * A canonical Huffman code satisfies the properties that a longer + * codeword never lexicographically precedes a shorter codeword, and the + * lexicographic ordering of codewords of the same length is the same as + * the lexicographic ordering of the corresponding symbols. A canonical + * Huffman code, or more generally a canonical prefix code, can be + * reconstructed from only a list containing the codeword length of each + * symbol. + * + * The classic algorithm to generate a Huffman code creates a node for + * each symbol, then inserts these nodes into a min-heap keyed by symbol + * frequency. Then, repeatedly, the two lowest-frequency nodes are + * removed from the min-heap and added as the children of a new node + * having frequency equal to the sum of its two children, which is then + * inserted into the min-heap. When only a single node remains in the + * min-heap, it is the root of the Huffman tree. The codeword for each + * symbol is determined by the path needed to reach the corresponding + * node from the root. Descending to the left child appends a 0 bit, + * whereas descending to the right child appends a 1 bit. + * + * The classic algorithm is relatively easy to understand, but it is + * subject to a number of inefficiencies. In practice, it is fastest to + * first sort the symbols by frequency. (This itself can be subject to + * an optimization based on the fact that most frequencies tend to be + * low.) At the same time, we sort secondarily by symbol value, which + * aids the process of generating a canonical code. Then, during tree + * construction, no heap is necessary because both the leaf nodes and the + * unparented non-leaf nodes can be easily maintained in sorted order. + * Consequently, there can never be more than two possibilities for the + * next-lowest-frequency node. + * + * In addition, because we're generating a canonical code, we actually + * don't need the leaf nodes of the tree at all, only the non-leaf nodes. + * This is because for canonical code generation we don't need to know + * where the symbols are in the tree. Rather, we only need to know how + * many leaf nodes have each depth (codeword length). And this + * information can, in fact, be quickly generated from the tree of + * non-leaves only. + * + * Furthermore, we can build this stripped-down Huffman tree directly in + * the array in which the codewords are to be generated, provided that + * these array slots are large enough to hold a symbol and frequency + * value. + * + * Still furthermore, we don't even need to maintain explicit child + * pointers. We only need the parent pointers, and even those can be + * overwritten in-place with depth information as part of the process of + * extracting codeword lengths from the tree. So in summary, we do NOT + * need a big structure like: + * + * struct huffman_tree_node { + * unsigned int symbol; + * unsigned int frequency; + * unsigned int depth; + * struct huffman_tree_node *left_child; + * struct huffman_tree_node *right_child; + * }; + * + * + * ... which often gets used in "naive" implementations of Huffman code + * generation. + * + * Most of these optimizations are based on the implementation in 7-Zip + * (source file: C/HuffEnc.c), which has been placed in the public domain + * by Igor Pavlov. But I've rewritten the code with extensive comments, + * as it took me a while to figure out what it was doing...! + * + * --------------------------------------------------------------------- + * + * NOTE: in general, the same frequencies can be used to generate + * different length-limited canonical Huffman codes. One choice we have + * is during tree construction, when we must decide whether to prefer a + * leaf or non-leaf when there is a tie in frequency. Another choice we + * have is how to deal with codewords that would exceed @max_codeword_len + * bits in length. Both of these choices affect the resulting codeword + * lengths, which otherwise can be mapped uniquely onto the resulting + * canonical Huffman code. + * + * Normally, there is no problem with choosing one valid code over + * another, provided that they produce similar compression ratios. + * However, the LZMS compression format uses adaptive Huffman coding. It + * requires that both the decompressor and compressor build a canonical + * code equivalent to that which can be generated by using the classic + * Huffman tree construction algorithm and always processing leaves + * before non-leaves when there is a frequency tie. Therefore, we make + * sure to do this. This method also has the advantage of sometimes + * shortening the longest codeword that is generated. + * + * There also is the issue of how codewords longer than @max_codeword_len + * are dealt with. Fortunately, for LZMS this is irrelevant because + * because for the LZMS alphabets no codeword can ever exceed + * LZMS_MAX_CODEWORD_LEN (= 15). Since the LZMS algorithm regularly + * halves all frequencies, the frequencies cannot become high enough for + * a length 16 codeword to be generated. Specifically, I think that if + * ties are broken in favor of non-leaves (as we do), the lowest total + * frequency that would give a length-16 codeword would be the sum of the + * frequencies 1 1 1 3 4 7 11 18 29 47 76 123 199 322 521 843 1364, which + * is 3570. And in LZMS we can't get a frequency that high based on the + * alphabet sizes, rebuild frequencies, and scaling factors. This + * worst-case scenario is based on the following degenerate case (only + * the bottom of the tree shown): + * + * ... + * 17 + * / \ + * 10 7 + * / \ + * 6 4 + * / \ + * 3 3 + * / \ + * 2 1 + * / \ + * 1 1 + * + * Excluding the first leaves (those with value 1), each leaf value must + * be greater than the non-leaf up 1 and down 2 from it; otherwise that + * leaf would have taken precedence over that non-leaf and been combined + * with the leaf below, thereby decreasing the height compared to that + * shown. + * + * Interesting fact: if we were to instead prioritize non-leaves over + * leaves, then the worst case frequencies would be the Fibonacci + * sequence, plus an extra frequency of 1. In this hypothetical + * scenario, it would be slightly easier for longer codewords to be + * generated. + */ +void +make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len, + const u32 freqs[restrict], + u8 lens[restrict], u32 codewords[restrict]) +{ + u32 *A = codewords; + unsigned num_used_syms; + + /* Assumptions */ + wimlib_assert2(num_syms >= 2); + wimlib_assert2(num_syms <= (1 << NUM_SYMBOL_BITS)); + wimlib_assert2(max_codeword_len > 0); + wimlib_assert2(max_codeword_len <= 32); + + /* We begin by sorting the symbols primarily by frequency and + * secondarily by symbol value. As an optimization, the array + * used for this purpose ('A') shares storage with the space in + * which we will eventually return the codewords. */ + + num_used_syms = sort_symbols(num_syms, freqs, lens, A); + + /* 'num_used_syms' is the number of symbols with nonzero + * frequency. This may be less than @num_syms. 'num_used_syms' + * is also the number of entries in 'A' that are valid. Each + * entry consists of a distinct symbol and a nonzero frequency + * packed into a 32-bit integer. */ + + /* Handle special cases where only 0 or 1 symbols were used (had + * nonzero frequency). */ + + if (unlikely(num_used_syms == 0)) { + /* Code is empty. sort_symbols() already set all lengths + * to 0, so there is nothing more to do. */ + return; + } - /* Sort the leaf nodes primarily by code length and secondarily by - * symbol. */ - qsort(leaves, num_used_symbols, sizeof(leaves[0]), cmp_nodes_by_code_len); + if (unlikely(num_used_syms == 1)) { + /* Only one symbol was used, so we only need one + * codeword. But two codewords are needed to form the + * smallest complete Huffman code, which uses codewords 0 + * and 1. Therefore, we choose another symbol to which + * to assign a codeword. We use 0 (if the used symbol is + * not 0) or 1 (if the used symbol is 0). In either + * case, the lesser-valued symbol must be assigned + * codeword 0 so that the resulting code is canonical. */ + + unsigned sym = A[0] & SYMBOL_MASK; + unsigned nonzero_idx = sym ? sym : 1; + + codewords[0] = 0; + lens[0] = 1; + codewords[nonzero_idx] = 1; + lens[nonzero_idx] = 1; + return; + } - u16 cur_codeword = 0; - unsigned cur_codeword_len = 0; - for (unsigned i = 0; i < num_used_symbols; i++) { + /* Build a stripped-down version of the Huffman tree, sharing the + * array 'A' with the symbol values. Then extract length counts + * from the tree and use them to generate the final codewords. */ - /* Each time a codeword becomes one longer, the current codeword - * is left shifted by one place. This is part of the procedure - * for enumerating the canonical Huffman code. Additionally, - * whenever a codeword is used, 1 is added to the current - * codeword. */ + build_tree(A, num_used_syms); - unsigned len_diff = leaves[i].path_len - cur_codeword_len; - cur_codeword <<= len_diff; - cur_codeword_len += len_diff; + { + unsigned len_counts[max_codeword_len + 1]; - u16 sym = leaves[i].sym; - codewords[sym] = cur_codeword; - lens[sym] = cur_codeword_len; + compute_length_counts(A, num_used_syms - 2, + len_counts, max_codeword_len); - cur_codeword++; + gen_codewords(A, lens, len_counts, max_codeword_len, num_syms); } }