/*
* compress_common.c
*
* Code for compression shared among multiple compression formats.
*
* Author: Eric Biggers
* Year: 2012 - 2014
*
* The author dedicates this file to the public domain.
* You can do whatever you want with this file.
*/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
#include
#include "wimlib/compress_common.h"
#include "wimlib/util.h"
/*
 * In-place heapsort of u32 values.
 *
 * The heap is stored with 0-based indices: the children of A[i] are
 * A[2*i + 1] and A[2*i + 2].  Indexing from 0 means that no pointer to
 * the element before the start of the array is ever formed; forming such
 * a pointer is undefined behavior in C even if it is never dereferenced.
 */

/*
 * Sift the value at A[subtree_idx] down until it is no smaller than both
 * of its children.
 *
 * Precondition: the subtrees rooted at the children of A[subtree_idx]
 * already satisfy the maxheap property.  On return, the whole subtree
 * rooted at A[subtree_idx] satisfies it.
 *
 * @A            The heap array (0-based).
 * @length       Number of entries of 'A' that belong to the heap.
 * @subtree_idx  Index of the subtree root; must be less than 'length'.
 */
static void
heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx)
{
	const u32 v = A[subtree_idx];
	unsigned parent_idx = subtree_idx;

	for (;;) {
		unsigned child_idx = parent_idx * 2 + 1;

		/* No children: 'parent_idx' is a leaf of the heap. */
		if (child_idx >= length)
			break;

		/* Select the greater child (the right one exists only
		 * if its index is still inside the heap). */
		if (child_idx + 1 < length && A[child_idx + 1] > A[child_idx])
			child_idx++;

		/* 'v' dominates both children, so it belongs here. */
		if (v >= A[child_idx])
			break;

		/* Pull the greater child up and keep descending.  'v' is
		 * written only once, at its final position. */
		A[parent_idx] = A[child_idx];
		parent_idx = child_idx;
	}
	A[parent_idx] = v;
}

/*
 * Rearrange the first 'length' entries of 'A' so that they satisfy the
 * maxheap property.  Entries at index length / 2 and beyond have no
 * children, so only the entries before them need to be sifted down,
 * from the last such entry back to the root.
 */
static void
heapify_array(u32 A[], unsigned length)
{
	unsigned subtree_idx = length / 2;

	while (subtree_idx-- > 0)
		heapify_subtree(A, length, subtree_idx);
}

/*
 * Sort the array 'A', which contains 'length' unsigned 32-bit integers,
 * into increasing order.  'length' may be 0 or 1, in which case the
 * array is not accessed at all.
 */
static void
heapsort(u32 A[], unsigned length)
{
	heapify_array(A, length);

	while (length >= 2) {
		/* Move the current maximum to its final slot at the end
		 * of the heap, then shrink the heap by one entry. */
		const u32 max = A[0];

		A[0] = A[length - 1];
		A[length - 1] = max;
		length--;

		/* The new root may violate the maxheap property. */
		heapify_subtree(A, length, 0);
	}
}
/* Number of low-order bits of each packed u32 array entry that hold a
 * symbol value.  The bits above them hold, depending on the stage of the
 * algorithm, a frequency, a parent node index, or a node depth. */
#define NUM_SYMBOL_BITS 10
/* Mask that selects the symbol bits of a packed entry; its complement
 * selects the frequency / parent index / depth bits. */
#define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1)
/*
 * Collect the symbols that have nonzero frequency and order them
 * primarily by frequency and secondarily by symbol value.
 *
 * Each output entry packs a symbol and its frequency into one u32: the
 * symbol occupies the low NUM_SYMBOL_BITS bits and the frequency is
 * stored in the bits above them.
 * NOTE(review): the frequency is simply shifted left by NUM_SYMBOL_BITS,
 * so this presumably relies on the caller keeping frequencies small
 * enough not to lose high bits -- confirm against callers.
 *
 * @num_syms
 *	Number of symbols in the alphabet.
 *	Can't be greater than (1 << NUM_SYMBOL_BITS).
 *
 * @freqs[num_syms]
 *	The frequency of each symbol.
 *
 * @lens[num_syms]
 *	The array that will eventually hold the codeword lengths.  Here,
 *	only the entries of zero-frequency symbols are written; they are
 *	set to 0.
 *
 * @symout[num_syms]
 *	Receives the packed, sorted (symbol, frequency) entries.
 *
 * Returns the number of entries written to 'symout', i.e. the number of
 * symbols that have nonzero frequency.
 */
static unsigned
sort_symbols(unsigned num_syms, const u32 freqs[restrict],
	     u8 lens[restrict], u32 symout[restrict])
{
	/*
	 * Strategy: a counting sort with a limited number of buckets,
	 * followed by a heapsort of the final bucket only.
	 *
	 * Most symbol frequencies tend to be low, so each low frequency
	 * gets a bucket of its own and needs no comparison sorting at
	 * all.  Every frequency too high to have its own bucket is
	 * clamped into the last bucket, and only that bucket is sorted
	 * with heapsort afterwards.
	 *
	 * Tuning notes (measured while building codes for LZX; other
	 * compression algorithms may behave differently):
	 *   - More symbols benefit from more buckets; roughly one bucket
	 *     per 4 symbols seemed fast.
	 *   - Radix sort was not any faster, even for large symbol
	 *     counts (> 255) with frequencies bounded at 16 bits, where
	 *     two base-256 digits would have sufficed.
	 *   - glibc's optimized quicksort (with the indirection overhead
	 *     removed) was only marginally faster than the simple
	 *     heapsort used here.
	 */
	unsigned num_counters = ALIGN(DIV_ROUND_UP(num_syms, 4), 4);
	unsigned counters[num_counters];
	unsigned num_used_syms = 0;
	unsigned overflow_start;
	unsigned bucket;
	unsigned sym;

	memset(counters, 0, sizeof(counters));

	/* Pass 1: histogram of the (clamped) frequencies. */
	for (sym = 0; sym < num_syms; sym++) {
		bucket = min(freqs[sym], num_counters - 1);
		counters[bucket]++;
	}

	/* Pass 2: replace each count by the output position at which
	 * its bucket starts.  Bucket 0 holds the zero-frequency symbols
	 * and is skipped, so the running total ends up being the number
	 * of symbols with nonzero frequency. */
	for (bucket = 1; bucket < num_counters; bucket++) {
		unsigned bucket_start = num_used_syms;

		num_used_syms += counters[bucket];
		counters[bucket] = bucket_start;
	}

	/* Pass 3: scatter the used symbols into their buckets.  Symbols
	 * are visited in increasing order, so within a bucket they stay
	 * ordered by symbol value.  Unused symbols get length 0. */
	for (sym = 0; sym < num_syms; sym++) {
		u32 freq = freqs[sym];

		if (freq == 0) {
			lens[sym] = 0;
			continue;
		}
		bucket = min(freq, num_counters - 1);
		symout[counters[bucket]] = sym | (freq << NUM_SYMBOL_BITS);
		counters[bucket]++;
	}

	/* After the scatter pass each counter holds the end position of
	 * its bucket, so the last bucket spans from the end of the
	 * second-to-last bucket to the end of the last one.  It mixes
	 * different frequencies and therefore still has to be sorted. */
	overflow_start = counters[num_counters - 2];
	heapsort(symout + overflow_start,
		 counters[num_counters - 1] - overflow_start);

	return num_used_syms;
}
/*
 * Construct the Huffman tree from frequencies that are already sorted.
 *
 * The implementation is deliberately minimal:
 *   (a) it exploits the fact that the input is sorted by frequency;
 *   (b) it creates non-leaf nodes only, since those are sufficient to
 *       derive a canonical code;
 *   (c) it records parent links only, never child links;
 *   (d) it writes the nodes into the same array that supplied the
 *       frequencies.
 *
 * 'A' has 'sym_count' entries and serves as both input and output.
 * 'sym_count' must be at least 2.
 *
 * Input: entry k holds the k-th smallest frequency, left shifted by
 * NUM_SYMBOL_BITS bits.  The low NUM_SYMBOL_BITS bits of every entry
 * (which in practice contain the symbol values) are never interpreted
 * by this function and are carried through untouched.
 *
 * Output: the first (sym_count - 1) entries describe the non-leaf nodes
 * of the tree, with A[sym_count - 2] being the root.  Every node other
 * than the root stores the zero-based index of its parent in 'A', left
 * shifted by NUM_SYMBOL_BITS bits.  The low bits still hold whatever
 * symbol value was there on input; that symbol has *no relationship*
 * with the node sharing its slot, because leaves are never materialized.
 */
static void
build_tree(u32 A[], unsigned sym_count)
{
	/* Next leaf (symbol) in 'A' that has not been consumed yet. */
	unsigned next_leaf = 0;

	/* Lowest-frequency non-leaf node that has no parent yet.  When
	 * equal to 'next_node', no such node currently exists. */
	unsigned next_orphan = 0;

	/* Slot in 'A' at which the next non-leaf node will be created. */
	unsigned next_node = 0;

	do {
		unsigned first, second;
		u32 parent_link;
		u32 sum_shifted;
		int take_leaf;

		/* Pick the two lowest-frequency candidates.  Each pick
		 * is either the next leaf or the next parentless
		 * non-leaf; on a frequency tie the leaf is taken. */
		take_leaf = next_leaf != sym_count &&
			    (next_orphan == next_node ||
			     (A[next_leaf] >> NUM_SYMBOL_BITS) <=
			     (A[next_orphan] >> NUM_SYMBOL_BITS));
		first = take_leaf ? next_leaf++ : next_orphan++;

		take_leaf = next_leaf != sym_count &&
			    (next_orphan == next_node ||
			     (A[next_leaf] >> NUM_SYMBOL_BITS) <=
			     (A[next_orphan] >> NUM_SYMBOL_BITS));
		second = take_leaf ? next_leaf++ : next_orphan++;

		/* The new node's frequency is the sum of the two picked
		 * frequencies.  It must be read before the entries are
		 * overwritten with parent links below. */
		sum_shifted = (A[first] & ~SYMBOL_MASK) +
			      (A[second] & ~SYMBOL_MASK);

		/* Link both picks to the new node.  For a pick that was
		 * a leaf the link is pointless (the slot gets replaced
		 * by a non-leaf once 'next_node' reaches it), but
		 * writing it unconditionally costs nothing extra.
		 *
		 * The new node is written last on purpose: its slot can
		 * coincide with one of the two slots just linked. */
		parent_link = next_node << NUM_SYMBOL_BITS;
		A[first] = (A[first] & SYMBOL_MASK) | parent_link;
		A[second] = (A[second] & SYMBOL_MASK) | parent_link;
		A[next_node] = (A[next_node] & SYMBOL_MASK) | sum_shifted;
		next_node++;
	} while (sym_count - next_node > 1);

	/* The one entry left over is a "leaf" slot that was linked to
	 * some node; it is ignored because only the non-leaves before
	 * it are needed.  (Alphabets with 0 or 1 used symbols are
	 * assumed to have been handled by the caller.) */
}
/*
 * From the stripped-down Huffman tree made by build_tree(), work out how
 * many codewords get each possible length, honoring the limit on the
 * codeword length.
 *
 * @A
 *	The array produced by build_tree(): parent index information for
 *	the non-leaf nodes of the tree.  A node's parent always has a
 *	greater index than the node itself.  The parent indices are
 *	overwritten with node depths here, so the tree is effectively
 *	destroyed; the low NUM_SYMBOL_BITS bits of every entry are
 *	preserved.
 *
 * @root_idx
 *	The 0-based index of the root node in 'A'.  This is one less than
 *	the number of tree nodes in 'A' (and 2 less than the actual
 *	length of 'A').
 *
 * @len_counts
 *	An array of ('max_codeword_len' + 1) entries.  On return, entry
 *	'len' is the number of codewords that have length 'len'.
 *
 * @max_codeword_len
 *	The maximum permissible codeword length.
 */
static void
compute_length_counts(u32 A[restrict], unsigned root_idx,
		      unsigned len_counts[restrict], unsigned max_codeword_len)
{
	/*
	 * How it works:
	 *
	 * (1) Parents have greater indices than their children, so
	 *     walking the array backwards visits every non-leaf node
	 *     after its parent.  A single pass can therefore compute
	 *     all depths, storing each depth where the parent index
	 *     used to be.
	 *
	 * (2) Start by pretending that both children of the root are
	 *     leaves, i.e. there are two codewords of length 1.  Each
	 *     non-leaf node visited afterwards corrects that picture:
	 *     the node itself is not a leaf after all (one codeword
	 *     fewer at its depth), but its two children are taken to
	 *     be leaves (two codewords more at its depth plus one).
	 *
	 * (3) The length limit is enforced by using the largest
	 *     available length whenever a depth is too great.
	 */
	unsigned i;
	int node;

	for (i = 0; i <= max_codeword_len; i++)
		len_counts[i] = 0;
	len_counts[1] = 2;

	/* The root has depth 0: clear everything but its symbol bits. */
	A[root_idx] = A[root_idx] & SYMBOL_MASK;

	node = (int)(root_idx - 1);
	while (node >= 0) {
		/* The parent was visited earlier, so its entry already
		 * holds a depth rather than a parent index. */
		unsigned parent = A[node] >> NUM_SYMBOL_BITS;
		unsigned depth = (A[parent] >> NUM_SYMBOL_BITS) + 1;
		unsigned len = depth;

		/* Store this node's depth for the benefit of its own
		 * children, which are visited later. */
		A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS);

		/* A non-leaf at 'max_codeword_len' or deeper would give
		 * its children codewords that are too long.  In that
		 * case, split a codeword of the greatest shorter length
		 * that still has one instead.  This is not the optimal
		 * way to build a length-limited Huffman code, but it
		 * should be good enough. */
		if (len >= max_codeword_len) {
			len = max_codeword_len - 1;
			while (len_counts[len] == 0)
				len--;
		}

		/* One codeword of length 'len' turns into two codewords
		 * of length 'len + 1'. */
		len_counts[len]--;
		len_counts[len + 1] += 2;

		node--;
	}
}
/*
 * Assign codeword lengths to symbols and generate the codewords of the
 * resulting canonical Huffman code.
 *
 * @A
 *	On input, the low NUM_SYMBOL_BITS bits of the leading entries
 *	hold the used symbols, sorted primarily by frequency and
 *	secondarily by symbol value.  On output, A[sym] is the codeword
 *	of symbol 'sym'.
 *
 * @lens
 *	Output array for the codeword lengths, indexed by symbol.  Only
 *	entries of symbols listed in 'A' are written here; all entries
 *	are read back when the codewords are generated.
 *
 * @len_counts
 *	The number of codewords that will have each possible length
 *	<= max_codeword_len.
 *
 * @max_codeword_len
 *	Maximum length, in bits, of each codeword.
 *
 * @num_syms
 *	Number of symbols in the alphabet, including symbols with zero
 *	frequency.  This is the length of the 'A' and 'lens' arrays.
 */
static void
gen_codewords(u32 A[restrict], u8 lens[restrict],
	      const unsigned len_counts[restrict],
	      unsigned max_codeword_len, unsigned num_syms)
{
	u32 next_codewords[max_codeword_len + 1];
	unsigned pos = 0;
	unsigned len;
	unsigned sym;

	/* Hand out the lengths.  The sorted symbol list starts with the
	 * least frequent symbol, so the lengths are handed out from the
	 * longest to the shortest, taking as many symbols per length as
	 * 'len_counts' prescribes. */
	for (len = max_codeword_len; len >= 1; len--) {
		unsigned remaining;

		for (remaining = len_counts[len]; remaining != 0; remaining--) {
			lens[A[pos] & SYMBOL_MASK] = len;
			pos++;
		}
	}

	/* Compute the lexicographically first codeword of each length:
	 * it follows the last codeword of the previous length, extended
	 * by one bit. */
	next_codewords[0] = 0;
	next_codewords[1] = 0;
	for (len = 2; len <= max_codeword_len; len++) {
		u32 prev_end = next_codewords[len - 1] + len_counts[len - 1];

		next_codewords[len] = prev_end << 1;
	}

	/* Assign the codewords in symbol order, which is what makes the
	 * code canonical.  Every symbol takes the next free codeword of
	 * its length. */
	for (sym = 0; sym < num_syms; sym++) {
		u8 sym_len = lens[sym];

		A[sym] = next_codewords[sym_len];
		next_codewords[sym_len]++;
	}
}
/*
* ---------------------------------------------------------------------
* make_canonical_huffman_code()
* ---------------------------------------------------------------------
*
* Given an alphabet and the frequency of each symbol in it, construct a
* length-limited canonical Huffman code.
*
* @num_syms
* The number of symbols in the alphabet. The symbols are the
* integers in the range [0, num_syms - 1]. This parameter must be
* at least 2 and can't be greater than (1 << NUM_SYMBOL_BITS).
*
* @max_codeword_len
* The maximum permissible codeword length.
*
* @freqs
* An array of @num_syms entries, each of which specifies the
* frequency of the corresponding symbol. It is valid for some,
* none, or all of the frequencies to be 0.
*
* @lens
* An array of @num_syms entries in which this function will return
* the length, in bits, of the codeword assigned to each symbol.
* Symbols with 0 frequency will not have codewords per se, but
* their entries in this array will be set to 0. No lengths greater
* than @max_codeword_len will be assigned.
*
* @codewords
* An array of @num_syms entries in which this function will return
* the codeword for each symbol, right-justified and padded on the
* left with zeroes. Codewords for symbols with 0 frequency will be
* undefined.
*
* ---------------------------------------------------------------------
*
* This function builds a length-limited canonical Huffman code.
*
* A length-limited Huffman code contains no codewords longer than some
* specified length, and has exactly (with some algorithms) or
* approximately (with the algorithm used here) the minimum weighted path
* length from the root, given this constraint.
*
* A canonical Huffman code satisfies the properties that a longer
* codeword never lexicographically precedes a shorter codeword, and the
* lexicographic ordering of codewords of the same length is the same as
* the lexicographic ordering of the corresponding symbols. A canonical
* Huffman code, or more generally a canonical prefix code, can be
* reconstructed from only a list containing the codeword length of each
* symbol.
*
* The classic algorithm to generate a Huffman code creates a node for
* each symbol, then inserts these nodes into a min-heap keyed by symbol
* frequency. Then, repeatedly, the two lowest-frequency nodes are
* removed from the min-heap and added as the children of a new node
* having frequency equal to the sum of its two children, which is then
* inserted into the min-heap. When only a single node remains in the
* min-heap, it is the root of the Huffman tree. The codeword for each
* symbol is determined by the path needed to reach the corresponding
* node from the root. Descending to the left child appends a 0 bit,
* whereas descending to the right child appends a 1 bit.
*
* The classic algorithm is relatively easy to understand, but it is
* subject to a number of inefficiencies. In practice, it is fastest to
* first sort the symbols by frequency. (This itself can be subject to
* an optimization based on the fact that most frequencies tend to be
* low.) At the same time, we sort secondarily by symbol value, which
* aids the process of generating a canonical code. Then, during tree
* construction, no heap is necessary because both the leaf nodes and the
* unparented non-leaf nodes can be easily maintained in sorted order.
* Consequently, there can never be more than two possibilities for the
* next-lowest-frequency node.
*
* In addition, because we're generating a canonical code, we actually
* don't need the leaf nodes of the tree at all, only the non-leaf nodes.
* This is because for canonical code generation we don't need to know
* where the symbols are in the tree. Rather, we only need to know how
* many leaf nodes have each depth (codeword length). And this
* information can, in fact, be quickly generated from the tree of
* non-leaves only.
*
* Furthermore, we can build this stripped-down Huffman tree directly in
* the array in which the codewords are to be generated, provided that
* these array slots are large enough to hold a symbol and frequency
* value.
*
* Still furthermore, we don't even need to maintain explicit child
* pointers. We only need the parent pointers, and even those can be
* overwritten in-place with depth information as part of the process of
* extracting codeword lengths from the tree. So in summary, we do NOT
* need a big structure like:
*
* struct huffman_tree_node {
* unsigned int symbol;
* unsigned int frequency;
* unsigned int depth;
* struct huffman_tree_node *left_child;
* struct huffman_tree_node *right_child;
* };
*
*
* ... which often gets used in "naive" implementations of Huffman code
* generation.
*
* Most of these optimizations are based on the implementation in 7-Zip
* (source file: C/HuffEnc.c), which has been placed in the public domain
* by Igor Pavlov. But I've rewritten the code with extensive comments,
* as it took me a while to figure out what it was doing...!
*
* ---------------------------------------------------------------------
*
* NOTE: in general, the same frequencies can be used to generate
* different length-limited canonical Huffman codes. One choice we have
* is during tree construction, when we must decide whether to prefer a
* leaf or non-leaf when there is a tie in frequency. Another choice we
* have is how to deal with codewords that would exceed @max_codeword_len
* bits in length. Both of these choices affect the resulting codeword
* lengths, which otherwise can be mapped uniquely onto the resulting
* canonical Huffman code.
*
* Normally, there is no problem with choosing one valid code over
* another, provided that they produce similar compression ratios.
* However, the LZMS compression format uses adaptive Huffman coding. It
* requires that both the decompressor and compressor build a canonical
* code equivalent to that which can be generated by using the classic
* Huffman tree construction algorithm and always processing leaves
* before non-leaves when there is a frequency tie. Therefore, we make
* sure to do this. This method also has the advantage of sometimes
* shortening the longest codeword that is generated.
*
* There also is the issue of how codewords longer than @max_codeword_len
* are dealt with. Fortunately, for LZMS this is irrelevant because
* for the LZMS alphabets no codeword can ever exceed
* LZMS_MAX_CODEWORD_LEN (= 15). Since the LZMS algorithm regularly
* halves all frequencies, the frequencies cannot become high enough for
* a length 16 codeword to be generated. Specifically, I think that if
* ties are broken in favor of leaves (as we do), the lowest total
* frequency that would give a length-16 codeword would be the sum of the
* frequencies 1 1 1 3 4 7 11 18 29 47 76 123 199 322 521 843 1364, which
* is 3570. And in LZMS we can't get a frequency that high based on the
* alphabet sizes, rebuild frequencies, and scaling factors. This
* worst-case scenario is based on the following degenerate case (only
* the bottom of the tree shown):
*
*                          ...
*                        17
*                       /  \
*                      10   7
*                     / \
*                    6   4
*                   / \
*                  3   3
*                 / \
*                2   1
*               / \
*              1   1
*
* Excluding the first leaves (those with value 1), each leaf value must
* be greater than the non-leaf up 1 and down 2 from it; otherwise that
* leaf would have taken precedence over that non-leaf and been combined
* with the leaf below, thereby decreasing the height compared to that
* shown.
*
* Interesting fact: if we were to instead prioritize non-leaves over
* leaves, then the worst case frequencies would be the Fibonacci
* sequence, plus an extra frequency of 1. In this hypothetical
* scenario, it would be slightly easier for longer codewords to be
* generated.
*/
void
make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len,
			    const u32 freqs[restrict],
			    u8 lens[restrict], u32 codewords[restrict])
{
	/* The output array doubles as working storage: it first holds
	 * the sorted symbols, then the tree nodes, and only at the very
	 * end the codewords. */
	u32 *A = codewords;

	/* Step 1: sort the symbols primarily by frequency and
	 * secondarily by symbol value.  Afterwards the first
	 * 'num_used_syms' entries of 'A' are valid; each one packs a
	 * distinct symbol together with its nonzero frequency.  The
	 * lengths of all zero-frequency symbols have been set to 0. */
	unsigned num_used_syms = sort_symbols(num_syms, freqs, lens, A);

	/* Step 2: deal with the degenerate alphabets for which no tree
	 * can be built. */
	if (unlikely(num_used_syms <= 1)) {
		if (num_used_syms == 1) {
			/* A complete Huffman code needs at least the two
			 * codewords 0 and 1, so besides the single used
			 * symbol a second symbol has to receive a
			 * codeword: symbol 0, or symbol 1 if the used
			 * symbol is 0 itself.  Either way the pair
			 * consists of symbol 0 and one nonzero symbol,
			 * and symbol 0, being the lesser one, must get
			 * codeword 0 for the code to be canonical. */
			unsigned used_sym = A[0] & SYMBOL_MASK;
			unsigned other_idx = (used_sym != 0) ? used_sym : 1;

			codewords[0] = 0;
			lens[0] = 1;
			codewords[other_idx] = 1;
			lens[other_idx] = 1;
		}
		/* With no used symbols the code is empty, and all the
		 * lengths are 0 already. */
		return;
	}

	/* Step 3: build the stripped-down Huffman tree in 'A', on top of
	 * the symbol values, ... */
	build_tree(A, num_used_syms);

	/* ... count how many codewords get each length (the root of the
	 * tree sits at index num_used_syms - 2), and generate the final
	 * lengths and codewords from these counts. */
	unsigned len_counts[max_codeword_len + 1];

	compute_length_counts(A, num_used_syms - 2,
			      len_counts, max_codeword_len);
	gen_codewords(A, lens, len_counts, max_codeword_len, num_syms);
}