From f2aeb875aa7e73a486c6b4f8bdf4952e49ec8b8e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 30 Jul 2022 19:03:42 -0700 Subject: [PATCH] compress_common: sync with libdeflate --- include/wimlib/compiler.h | 3 + include/wimlib/compress_common.h | 12 +- src/compress_common.c | 698 ++++++++++++++++--------------- src/divsufsort.c | 3 - 4 files changed, 366 insertions(+), 350 deletions(-) diff --git a/include/wimlib/compiler.h b/include/wimlib/compiler.h index 0eacffd8..294108c3 100644 --- a/include/wimlib/compiler.h +++ b/include/wimlib/compiler.h @@ -149,17 +149,20 @@ # define min(a, b) ({ typeof(a) _a = (a); typeof(b) _b = (b); \ (_a < _b) ? _a : _b; }) #endif +#define MIN(a, b) min((a), (b)) /* Get the maximum of two variables, without multiple evaluation. */ #ifndef max # define max(a, b) ({ typeof(a) _a = (a); typeof(b) _b = (b); \ (_a > _b) ? _a : _b; }) #endif +#define MAX(a, b) max((a), (b)) /* Swap the values of two variables, without multiple evaluation. */ #ifndef swap # define swap(a, b) ({ typeof(a) _a = (a); (a) = (b); (b) = _a; }) #endif +#define SWAP(a, b) swap((a), (b)) /* (Optional) Efficiently swap the bytes of a 16-bit integer. */ #if GCC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16) diff --git a/include/wimlib/compress_common.h b/include/wimlib/compress_common.h index 7913a138..f2ed4109 100644 --- a/include/wimlib/compress_common.h +++ b/include/wimlib/compress_common.h @@ -9,11 +9,11 @@ #include "wimlib/types.h" -extern void -make_canonical_huffman_code(unsigned num_syms, - unsigned max_codeword_len, - const u32 freq_tab[restrict], - u8 lens[restrict], - u32 codewords[restrict]); +#define MAX_NUM_SYMS 799 /* LZMS_MAX_NUM_SYMS */ +#define MAX_CODEWORD_LEN 16 + +void +make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len, + const u32 freqs[], u8 lens[], u32 codewords[]); #endif /* _WIMLIB_COMPRESS_COMMON_H */ diff --git a/src/compress_common.c b/src/compress_common.c index ac02e0cf..0e2ba05a 100644 --- a/src/compress_common.c +++ b/src/compress_common.c @@ -33,13 +33,16 @@ #include +#include "wimlib/assert.h" #include "wimlib/compress_common.h" #include "wimlib/util.h" -/* Given the binary tree node A[subtree_idx] whose children already - * satisfy the maxheap property, swap the node with its greater child - * until it is greater than both its children, so that the maxheap - * property is satisfied in the subtree rooted at A[subtree_idx]. */ +/* + * Given the binary tree node A[subtree_idx] whose children already satisfy the + * maxheap property, swap the node with its greater child until it is greater + * than or equal to both of its children, so that the maxheap property is + * satisfied in the subtree rooted at A[subtree_idx]. 'A' uses 1-based indices. + */ static void heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx) { @@ -60,297 +63,306 @@ heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx) A[parent_idx] = v; } -/* Rearrange the array 'A' so that it satisfies the maxheap property. +/* + * Rearrange the array 'A' so that it satisfies the maxheap property. * 'A' uses 1-based indices, so the children of A[i] are A[i*2] and A[i*2 + 1]. */ static void heapify_array(u32 A[], unsigned length) { - for (unsigned subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--) + unsigned subtree_idx; + + for (subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--) heapify_subtree(A, length, subtree_idx); } -/* Sort the array 'A', which contains 'length' unsigned 32-bit integers. */ +/* + * Sort the array 'A', which contains 'length' unsigned 32-bit integers. + * + * Note: name this function heap_sort() instead of heapsort() to avoid colliding + * with heapsort() from stdlib.h on BSD-derived systems --- though this isn't + * necessary when compiling with -D_ANSI_SOURCE, which is the better solution. + */ static void -heapsort(u32 A[], unsigned length) +heap_sort(u32 A[], unsigned length) { A--; /* Use 1-based indices */ heapify_array(A, length); while (length >= 2) { - swap(A[1], A[length]); + u32 tmp = A[length]; + + A[length] = A[1]; + A[1] = tmp; length--; heapify_subtree(A, length, 1); } } #define NUM_SYMBOL_BITS 10 -#define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1) +#define NUM_FREQ_BITS (32 - NUM_SYMBOL_BITS) +#define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1) +#define FREQ_MASK (~SYMBOL_MASK) + +#define GET_NUM_COUNTERS(num_syms) (num_syms) /* - * Sort the symbols primarily by frequency and secondarily by symbol - * value. Discard symbols with zero frequency and fill in an array with - * the remaining symbols, along with their frequencies. The low - * NUM_SYMBOL_BITS bits of each array entry will contain the symbol - * value, and the remaining bits will contain the frequency. + * Sort the symbols primarily by frequency and secondarily by symbol value. + * Discard symbols with zero frequency and fill in an array with the remaining + * symbols, along with their frequencies. The low NUM_SYMBOL_BITS bits of each + * array entry will contain the symbol value, and the remaining bits will + * contain the frequency. * * @num_syms - * Number of symbols in the alphabet. - * Can't be greater than (1 << NUM_SYMBOL_BITS). + * Number of symbols in the alphabet, at most 1 << NUM_SYMBOL_BITS. * * @freqs[num_syms] - * The frequency of each symbol. + * Frequency of each symbol, summing to at most (1 << NUM_FREQ_BITS) - 1. * * @lens[num_syms] - * An array that eventually will hold the length of each codeword. - * This function only fills in the codeword lengths for symbols that - * have zero frequency, which are not well defined per se but will - * be set to 0. + * An array that eventually will hold the length of each codeword. This + * function only fills in the codeword lengths for symbols that have zero + * frequency, which are not well defined per se but will be set to 0. * * @symout[num_syms] * The output array, described above. * - * Returns the number of entries in 'symout' that were filled. This is - * the number of symbols that have nonzero frequency. + * Returns the number of entries in 'symout' that were filled. This is the + * number of symbols that have nonzero frequency. */ static unsigned -sort_symbols(unsigned num_syms, const u32 freqs[restrict], - u8 lens[restrict], u32 symout[restrict]) +sort_symbols(unsigned num_syms, const u32 freqs[], u8 lens[], u32 symout[]) { + unsigned sym; + unsigned i; unsigned num_used_syms; unsigned num_counters; + unsigned counters[GET_NUM_COUNTERS(MAX_NUM_SYMS)]; - /* We rely on heapsort, but with an added optimization. Since - * it's common for most symbol frequencies to be low, we first do - * a count sort using a limited number of counters. High - * frequencies will be counted in the last counter, and only they - * will be sorted with heapsort. + /* + * We use heapsort, but with an added optimization. Since often most + * symbol frequencies are low, we first do a count sort using a limited + * number of counters. High frequencies are counted in the last + * counter, and only they will be sorted with heapsort. * * Note: with more symbols, it is generally beneficial to have more - * counters. About 1 counter per 4 symbols seems fast. - * - * Note: I also tested radix sort, but even for large symbol - * counts (> 255) and frequencies bounded at 16 bits (enabling - * radix sort by just two base-256 digits), it didn't seem any - * faster than the method implemented here. - * - * Note: I tested the optimized quicksort implementation from - * glibc (with indirection overhead removed), but it was only - * marginally faster than the simple heapsort implemented here. - * - * Tests were done with building the codes for LZX. Results may - * vary for different compression algorithms...! */ - - num_counters = ALIGN(DIV_ROUND_UP(num_syms, 4), 4); + * counters. About 1 counter per symbol seems fastest. + */ - unsigned counters[num_counters]; + num_counters = GET_NUM_COUNTERS(num_syms); - memset(counters, 0, sizeof(counters)); + memset(counters, 0, num_counters * sizeof(counters[0])); - /* Count the frequencies. */ - for (unsigned sym = 0; sym < num_syms; sym++) - counters[min(freqs[sym], num_counters - 1)]++; + /* Count the frequencies. */ + for (sym = 0; sym < num_syms; sym++) + counters[MIN(freqs[sym], num_counters - 1)]++; - /* Make the counters cumulative, ignoring the zero-th, which - * counted symbols with zero frequency. As a side effect, this - * calculates the number of symbols with nonzero frequency. */ + /* + * Make the counters cumulative, ignoring the zero-th, which counted + * symbols with zero frequency. As a side effect, this calculates the + * number of symbols with nonzero frequency. + */ num_used_syms = 0; - for (unsigned i = 1; i < num_counters; i++) { + for (i = 1; i < num_counters; i++) { unsigned count = counters[i]; + counters[i] = num_used_syms; num_used_syms += count; } - /* Sort nonzero-frequency symbols using the counters. At the - * same time, set the codeword lengths of zero-frequency symbols - * to 0. */ - for (unsigned sym = 0; sym < num_syms; sym++) { + /* + * Sort nonzero-frequency symbols using the counters. At the same time, + * set the codeword lengths of zero-frequency symbols to 0. + */ + for (sym = 0; sym < num_syms; sym++) { u32 freq = freqs[sym]; + if (freq != 0) { - symout[counters[min(freq, num_counters - 1)]++] = + symout[counters[MIN(freq, num_counters - 1)]++] = sym | (freq << NUM_SYMBOL_BITS); } else { lens[sym] = 0; } } - /* Sort the symbols counted in the last counter. */ - heapsort(symout + counters[num_counters - 2], - counters[num_counters - 1] - counters[num_counters - 2]); + /* Sort the symbols counted in the last counter. */ + heap_sort(symout + counters[num_counters - 2], + counters[num_counters - 1] - counters[num_counters - 2]); return num_used_syms; } /* - * Build the Huffman tree. + * Build a Huffman tree. * * This is an optimized implementation that * (a) takes advantage of the frequencies being already sorted; - * (b) only generates non-leaf nodes, since the non-leaf nodes of a - * Huffman tree are sufficient to generate a canonical code; + * (b) only generates non-leaf nodes, since the non-leaf nodes of a Huffman + * tree are sufficient to generate a canonical code; * (c) Only stores parent pointers, not child pointers; - * (d) Produces the nodes in the same memory used for input - * frequency information. - * - * Array 'A', which contains 'sym_count' entries, is used for both input - * and output. For this function, 'sym_count' must be at least 2. - * - * For input, the array must contain the frequencies of the symbols, - * sorted in increasing order. Specifically, each entry must contain a - * frequency left shifted by NUM_SYMBOL_BITS bits. Any data in the low - * NUM_SYMBOL_BITS bits of the entries will be ignored by this function. - * Although these bits will, in fact, contain the symbols that correspond - * to the frequencies, this function is concerned with frequencies only - * and keeps the symbols as-is. - * - * For output, this function will produce the non-leaf nodes of the - * Huffman tree. These nodes will be stored in the first (sym_count - 1) - * entries of the array. Entry A[sym_count - 2] will represent the root - * node. Each other node will contain the zero-based index of its parent - * node in 'A', left shifted by NUM_SYMBOL_BITS bits. The low - * NUM_SYMBOL_BITS bits of each entry in A will be kept as-is. Again, - * note that although these low bits will, in fact, contain a symbol - * value, this symbol will have *no relationship* with the Huffman tree - * node that happens to occupy the same slot. This is because this + * (d) Produces the nodes in the same memory used for input frequency + * information. + * + * Array 'A', which contains 'sym_count' entries, is used for both input and + * output. For this function, 'sym_count' must be at least 2. + * + * For input, the array must contain the frequencies of the symbols, sorted in + * increasing order. Specifically, each entry must contain a frequency left + * shifted by NUM_SYMBOL_BITS bits. Any data in the low NUM_SYMBOL_BITS bits of + * the entries will be ignored by this function. Although these bits will, in + * fact, contain the symbols that correspond to the frequencies, this function + * is concerned with frequencies only and keeps the symbols as-is. + * + * For output, this function will produce the non-leaf nodes of the Huffman + * tree. These nodes will be stored in the first (sym_count - 1) entries of the + * array. Entry A[sym_count - 2] will represent the root node. Each other node + * will contain the zero-based index of its parent node in 'A', left shifted by + * NUM_SYMBOL_BITS bits. The low NUM_SYMBOL_BITS bits of each entry in A will + * be kept as-is. Again, note that although these low bits will, in fact, + * contain a symbol value, this symbol will have *no relationship* with the + * Huffman tree node that happens to occupy the same slot. This is because this * implementation only generates the non-leaf nodes of the tree. */ static void build_tree(u32 A[], unsigned sym_count) { - /* Index, in 'A', of next lowest frequency symbol that has not - * yet been processed. */ + const unsigned last_idx = sym_count - 1; + + /* Index of the next lowest frequency leaf that still needs a parent */ unsigned i = 0; - /* Index, in 'A', of next lowest frequency parentless non-leaf - * node; or, if equal to 'e', then no such node exists yet. */ + /* + * Index of the next lowest frequency non-leaf that still needs a + * parent, or 'e' if there is currently no such node + */ unsigned b = 0; - /* Index, in 'A', of next node to allocate as a non-leaf. */ + /* Index of the next spot for a non-leaf (will overwrite a leaf) */ unsigned e = 0; do { - unsigned m, n; - u32 freq_shifted; + u32 new_freq; - /* Choose the two next lowest frequency entries. */ - - if (i != sym_count && - (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS))) - m = i++; - else - m = b++; - - if (i != sym_count && - (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS))) - n = i++; - else - n = b++; - - /* Allocate a non-leaf node and link the entries to it. + /* + * Select the next two lowest frequency nodes among the leaves + * A[i] and non-leaves A[b], and create a new node A[e] to be + * their parent. Set the new node's frequency to the sum of the + * frequencies of its two children. * - * If we link an entry that we're visiting for the first - * time (via index 'i'), then we're actually linking a - * leaf node and it will have no effect, since the leaf - * will be overwritten with a non-leaf when index 'e' - * catches up to it. But it's not any slower to - * unconditionally set the parent index. - * - * We also compute the frequency of the non-leaf node as - * the sum of its two children's frequencies. */ - - freq_shifted = (A[m] & ~SYMBOL_MASK) + (A[n] & ~SYMBOL_MASK); - - A[m] = (A[m] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS); - A[n] = (A[n] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS); - A[e] = (A[e] & SYMBOL_MASK) | freq_shifted; - e++; - } while (sym_count - e > 1); - /* When just one entry remains, it is a "leaf" that was - * linked to some other node. We ignore it, since the - * rest of the array contains the non-leaves which we - * need. (Note that we're assuming the cases with 0 or 1 - * symbols were handled separately.) */ + * Usually the next two lowest frequency nodes are of the same + * type (leaf or non-leaf), so check those cases first. + */ + if (i + 1 <= last_idx && + (b == e || (A[i + 1] & FREQ_MASK) <= (A[b] & FREQ_MASK))) { + /* Two leaves */ + new_freq = (A[i] & FREQ_MASK) + (A[i + 1] & FREQ_MASK); + i += 2; + } else if (b + 2 <= e && + (i > last_idx || + (A[b + 1] & FREQ_MASK) < (A[i] & FREQ_MASK))) { + /* Two non-leaves */ + new_freq = (A[b] & FREQ_MASK) + (A[b + 1] & FREQ_MASK); + A[b] = (e << NUM_SYMBOL_BITS) | (A[b] & SYMBOL_MASK); + A[b + 1] = (e << NUM_SYMBOL_BITS) | + (A[b + 1] & SYMBOL_MASK); + b += 2; + } else { + /* One leaf and one non-leaf */ + new_freq = (A[i] & FREQ_MASK) + (A[b] & FREQ_MASK); + A[b] = (e << NUM_SYMBOL_BITS) | (A[b] & SYMBOL_MASK); + i++; + b++; + } + A[e] = new_freq | (A[e] & SYMBOL_MASK); + /* + * A binary tree with 'n' leaves has 'n - 1' non-leaves, so the + * tree is complete once we've created 'n - 1' non-leaves. + */ + } while (++e < last_idx); } /* - * Given the stripped-down Huffman tree constructed by build_tree(), - * determine the number of codewords that should be assigned each - * possible length, taking into account the length-limited constraint. + * Given the stripped-down Huffman tree constructed by build_tree(), determine + * the number of codewords that should be assigned each possible length, taking + * into account the length-limited constraint. * * @A - * The array produced by build_tree(), containing parent index - * information for the non-leaf nodes of the Huffman tree. Each - * entry in this array is a node; a node's parent always has a - * greater index than that node itself. This function will - * overwrite the parent index information in this array, so - * essentially it will destroy the tree. However, the data in the - * low NUM_SYMBOL_BITS of each entry will be preserved. + * The array produced by build_tree(), containing parent index information + * for the non-leaf nodes of the Huffman tree. Each entry in this array is + * a node; a node's parent always has a greater index than that node + * itself. This function will overwrite the parent index information in + * this array, so essentially it will destroy the tree. However, the data + * in the low NUM_SYMBOL_BITS of each entry will be preserved. * * @root_idx - * The 0-based index of the root node in 'A', and consequently one - * less than the number of tree node entries in 'A'. (Or, really 2 - * less than the actual length of 'A'.) + * The 0-based index of the root node in 'A', and consequently one less + * than the number of tree node entries in 'A'. (Or, really 2 less than + * the actual length of 'A'.) * * @len_counts * An array of length ('max_codeword_len' + 1) in which the number of - * codewords having each length <= max_codeword_len will be - * returned. + * codewords having each length <= max_codeword_len will be returned. * * @max_codeword_len * The maximum permissible codeword length. */ static void -compute_length_counts(u32 A[restrict], unsigned root_idx, - unsigned len_counts[restrict], unsigned max_codeword_len) +compute_length_counts(u32 A[], unsigned root_idx, unsigned len_counts[], + unsigned max_codeword_len) { - /* The key observations are: + unsigned len; + int node; + + /* + * The key observations are: * - * (1) We can traverse the non-leaf nodes of the tree, always - * visiting a parent before its children, by simply iterating - * through the array in reverse order. Consequently, we can - * compute the depth of each node in one pass, overwriting the - * parent indices with depths. + * (1) We can traverse the non-leaf nodes of the tree, always visiting a + * parent before its children, by simply iterating through the array + * in reverse order. Consequently, we can compute the depth of each + * node in one pass, overwriting the parent indices with depths. * - * (2) We can initially assume that in the real Huffman tree, - * both children of the root are leaves. This corresponds to two - * codewords of length 1. Then, whenever we visit a (non-leaf) - * node during the traversal, we modify this assumption to - * account for the current node *not* being a leaf, but rather - * its two children being leaves. This causes the loss of one - * codeword for the current depth and the addition of two - * codewords for the current depth plus one. + * (2) We can initially assume that in the real Huffman tree, both + * children of the root are leaves. This corresponds to two + * codewords of length 1. Then, whenever we visit a (non-leaf) node + * during the traversal, we modify this assumption to account for + * the current node *not* being a leaf, but rather its two children + * being leaves. This causes the loss of one codeword for the + * current depth and the addition of two codewords for the current + * depth plus one. * - * (3) We can handle the length-limited constraint fairly easily - * by simply using the largest length available when a depth - * exceeds max_codeword_len. + * (3) We can handle the length-limited constraint fairly easily by + * simply using the largest length available when a depth exceeds + * max_codeword_len. */ - for (unsigned len = 0; len <= max_codeword_len; len++) + for (len = 0; len <= max_codeword_len; len++) len_counts[len] = 0; len_counts[1] = 2; - /* Set the root node's depth to 0. */ + /* Set the root node's depth to 0. */ A[root_idx] &= SYMBOL_MASK; - for (int node = root_idx - 1; node >= 0; node--) { + for (node = root_idx - 1; node >= 0; node--) { - /* Calculate the depth of this node. */ + /* Calculate the depth of this node. */ unsigned parent = A[node] >> NUM_SYMBOL_BITS; unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS; unsigned depth = parent_depth + 1; unsigned len = depth; - /* Set the depth of this node so that it is available - * when its children (if any) are processed. */ - + /* + * Set the depth of this node so that it is available when its + * children (if any) are processed. + */ A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS); - /* If needed, decrease the length to meet the - * length-limited constraint. This is not the optimal - * method for generating length-limited Huffman codes! - * But it should be good enough. */ + /* + * If needed, decrease the length to meet the length-limited + * constraint. This is not the optimal method for generating + * length-limited Huffman codes! But it should be good enough. + */ if (len >= max_codeword_len) { len = max_codeword_len; do { @@ -358,8 +370,10 @@ compute_length_counts(u32 A[restrict], unsigned root_idx, } while (len_counts[len] == 0); } - /* Account for the fact that we have a non-leaf node at - * the current depth. */ + /* + * Account for the fact that we have a non-leaf node at the + * current depth. + */ len_counts[len]--; len_counts[len + 1] += 2; } @@ -389,34 +403,40 @@ compute_length_counts(u32 A[restrict], unsigned root_idx, * frequency. This is the length of the 'A' and 'len' arrays. */ static void -gen_codewords(u32 A[restrict], u8 lens[restrict], - const unsigned len_counts[restrict], +gen_codewords(u32 A[], u8 lens[], const unsigned len_counts[], unsigned max_codeword_len, unsigned num_syms) { - u32 next_codewords[max_codeword_len + 1]; - - /* Given the number of codewords that will have each length, - * assign codeword lengths to symbols. We do this by assigning - * the lengths in decreasing order to the symbols sorted - * primarily by increasing frequency and secondarily by - * increasing symbol value. */ - for (unsigned i = 0, len = max_codeword_len; len >= 1; len--) { + u32 next_codewords[MAX_CODEWORD_LEN + 1]; + unsigned i; + unsigned len; + unsigned sym; + + /* + * Given the number of codewords that will have each length, assign + * codeword lengths to symbols. We do this by assigning the lengths in + * decreasing order to the symbols sorted primarily by increasing + * frequency and secondarily by increasing symbol value. + */ + for (i = 0, len = max_codeword_len; len >= 1; len--) { unsigned count = len_counts[len]; + while (count--) lens[A[i++] & SYMBOL_MASK] = len; } - /* Generate the codewords themselves. We initialize the + /* + * Generate the codewords themselves. We initialize the * 'next_codewords' array to provide the lexicographically first - * codeword of each length, then assign codewords in symbol - * order. This produces a canonical code. */ + * codeword of each length, then assign codewords in symbol order. This + * produces a canonical code. + */ next_codewords[0] = 0; next_codewords[1] = 0; - for (unsigned len = 2; len <= max_codeword_len; len++) + for (len = 2; len <= max_codeword_len; len++) next_codewords[len] = (next_codewords[len - 1] + len_counts[len - 1]) << 1; - for (unsigned sym = 0; sym < num_syms; sym++) + for (sym = 0; sym < num_syms; sym++) A[sym] = next_codewords[lens[sym]]++; } @@ -429,88 +449,81 @@ gen_codewords(u32 A[restrict], u8 lens[restrict], * length-limited canonical Huffman code. * * @num_syms - * The number of symbols in the alphabet. The symbols are the - * integers in the range [0, num_syms - 1]. This parameter must be - * at least 2 and can't be greater than (1 << NUM_SYMBOL_BITS). + * The number of symbols in the alphabet. The symbols are the integers in + * the range [0, num_syms - 1]. This parameter must be at least 2 and + * must not exceed (1 << NUM_SYMBOL_BITS). * * @max_codeword_len * The maximum permissible codeword length. * * @freqs - * An array of @num_syms entries, each of which specifies the - * frequency of the corresponding symbol. It is valid for some, - * none, or all of the frequencies to be 0. + * An array of length @num_syms that gives the frequency of each symbol. + * It is valid for some, none, or all of the frequencies to be 0. The sum + * of frequencies must not exceed (1 << NUM_FREQ_BITS) - 1. * * @lens - * An array of @num_syms entries in which this function will return - * the length, in bits, of the codeword assigned to each symbol. - * Symbols with 0 frequency will not have codewords per se, but - * their entries in this array will be set to 0. No lengths greater - * than @max_codeword_len will be assigned. + * An array of @num_syms entries in which this function will return the + * length, in bits, of the codeword assigned to each symbol. Symbols with + * 0 frequency will not have codewords per se, but their entries in this + * array will be set to 0. No lengths greater than @max_codeword_len will + * be assigned. * * @codewords - * An array of @num_syms entries in which this function will return - * the codeword for each symbol, right-justified and padded on the - * left with zeroes. Codewords for symbols with 0 frequency will be - * undefined. + * An array of @num_syms entries in which this function will return the + * codeword for each symbol, right-justified and padded on the left with + * zeroes. Codewords for symbols with 0 frequency will be undefined. * * --------------------------------------------------------------------- * * This function builds a length-limited canonical Huffman code. * * A length-limited Huffman code contains no codewords longer than some - * specified length, and has exactly (with some algorithms) or - * approximately (with the algorithm used here) the minimum weighted path - * length from the root, given this constraint. - * - * A canonical Huffman code satisfies the properties that a longer - * codeword never lexicographically precedes a shorter codeword, and the - * lexicographic ordering of codewords of the same length is the same as - * the lexicographic ordering of the corresponding symbols. A canonical - * Huffman code, or more generally a canonical prefix code, can be - * reconstructed from only a list containing the codeword length of each - * symbol. - * - * The classic algorithm to generate a Huffman code creates a node for - * each symbol, then inserts these nodes into a min-heap keyed by symbol - * frequency. Then, repeatedly, the two lowest-frequency nodes are - * removed from the min-heap and added as the children of a new node - * having frequency equal to the sum of its two children, which is then - * inserted into the min-heap. When only a single node remains in the - * min-heap, it is the root of the Huffman tree. The codeword for each - * symbol is determined by the path needed to reach the corresponding - * node from the root. Descending to the left child appends a 0 bit, - * whereas descending to the right child appends a 1 bit. - * - * The classic algorithm is relatively easy to understand, but it is - * subject to a number of inefficiencies. In practice, it is fastest to - * first sort the symbols by frequency. (This itself can be subject to - * an optimization based on the fact that most frequencies tend to be - * low.) At the same time, we sort secondarily by symbol value, which - * aids the process of generating a canonical code. Then, during tree - * construction, no heap is necessary because both the leaf nodes and the - * unparented non-leaf nodes can be easily maintained in sorted order. - * Consequently, there can never be more than two possibilities for the - * next-lowest-frequency node. - * - * In addition, because we're generating a canonical code, we actually - * don't need the leaf nodes of the tree at all, only the non-leaf nodes. - * This is because for canonical code generation we don't need to know - * where the symbols are in the tree. Rather, we only need to know how - * many leaf nodes have each depth (codeword length). And this - * information can, in fact, be quickly generated from the tree of - * non-leaves only. - * - * Furthermore, we can build this stripped-down Huffman tree directly in - * the array in which the codewords are to be generated, provided that - * these array slots are large enough to hold a symbol and frequency - * value. - * - * Still furthermore, we don't even need to maintain explicit child - * pointers. We only need the parent pointers, and even those can be - * overwritten in-place with depth information as part of the process of - * extracting codeword lengths from the tree. So in summary, we do NOT - * need a big structure like: + * specified length, and has exactly (with some algorithms) or approximately + * (with the algorithm used here) the minimum weighted path length from the + * root, given this constraint. + * + * A canonical Huffman code satisfies the properties that a longer codeword + * never lexicographically precedes a shorter codeword, and the lexicographic + * ordering of codewords of the same length is the same as the lexicographic + * ordering of the corresponding symbols. A canonical Huffman code, or more + * generally a canonical prefix code, can be reconstructed from only a list + * containing the codeword length of each symbol. + * + * The classic algorithm to generate a Huffman code creates a node for each + * symbol, then inserts these nodes into a min-heap keyed by symbol frequency. + * Then, repeatedly, the two lowest-frequency nodes are removed from the + * min-heap and added as the children of a new node having frequency equal to + * the sum of its two children, which is then inserted into the min-heap. When + * only a single node remains in the min-heap, it is the root of the Huffman + * tree. The codeword for each symbol is determined by the path needed to reach + * the corresponding node from the root. Descending to the left child appends a + * 0 bit, whereas descending to the right child appends a 1 bit. + * + * The classic algorithm is relatively easy to understand, but it is subject to + * a number of inefficiencies. In practice, it is fastest to first sort the + * symbols by frequency. (This itself can be subject to an optimization based + * on the fact that most frequencies tend to be low.) At the same time, we sort + * secondarily by symbol value, which aids the process of generating a canonical + * code. Then, during tree construction, no heap is necessary because both the + * leaf nodes and the unparented non-leaf nodes can be easily maintained in + * sorted order. Consequently, there can never be more than two possibilities + * for the next-lowest-frequency node. + * + * In addition, because we're generating a canonical code, we actually don't + * need the leaf nodes of the tree at all, only the non-leaf nodes. This is + * because for canonical code generation we don't need to know where the symbols + * are in the tree. Rather, we only need to know how many leaf nodes have each + * depth (codeword length). And this information can, in fact, be quickly + * generated from the tree of non-leaves only. + * + * Furthermore, we can build this stripped-down Huffman tree directly in the + * array in which the codewords are to be generated, provided that these array + * slots are large enough to hold a symbol and frequency value. + * + * Still furthermore, we don't even need to maintain explicit child pointers. + * We only need the parent pointers, and even those can be overwritten in-place + * with depth information as part of the process of extracting codeword lengths + * from the tree. So in summary, we do NOT need a big structure like: * * struct huffman_tree_node { * unsigned int symbol; @@ -521,47 +534,40 @@ gen_codewords(u32 A[restrict], u8 lens[restrict], * }; * * - * ... which often gets used in "naive" implementations of Huffman code - * generation. - * - * Most of these optimizations are based on the implementation in 7-Zip - * (source file: C/HuffEnc.c), which has been placed in the public domain - * by Igor Pavlov. But I've rewritten the code with extensive comments, - * as it took me a while to figure out what it was doing...! - * - * --------------------------------------------------------------------- - * - * NOTE: in general, the same frequencies can be used to generate - * different length-limited canonical Huffman codes. One choice we have - * is during tree construction, when we must decide whether to prefer a - * leaf or non-leaf when there is a tie in frequency. Another choice we - * have is how to deal with codewords that would exceed @max_codeword_len - * bits in length. Both of these choices affect the resulting codeword - * lengths, which otherwise can be mapped uniquely onto the resulting - * canonical Huffman code. - * - * Normally, there is no problem with choosing one valid code over - * another, provided that they produce similar compression ratios. - * However, the LZMS compression format uses adaptive Huffman coding. It - * requires that both the decompressor and compressor build a canonical - * code equivalent to that which can be generated by using the classic - * Huffman tree construction algorithm and always processing leaves - * before non-leaves when there is a frequency tie. Therefore, we make - * sure to do this. This method also has the advantage of sometimes - * shortening the longest codeword that is generated. - * - * There also is the issue of how codewords longer than @max_codeword_len - * are dealt with. Fortunately, for LZMS this is irrelevant because for - * the LZMS alphabets no codeword can ever exceed LZMS_MAX_CODEWORD_LEN - * (= 15). Since the LZMS algorithm regularly halves all frequencies, - * the frequencies cannot become high enough for a length 16 codeword to - * be generated. Specifically, I think that if ties are broken in favor - * of non-leaves (as we do), the lowest total frequency that would give a - * length-16 codeword would be the sum of the frequencies 1 1 1 3 4 7 11 - * 18 29 47 76 123 199 322 521 843 1364, which is 3570. And in LZMS we - * can't get a frequency that high based on the alphabet sizes, rebuild - * frequencies, and scaling factors. This worst-case scenario is based - * on the following degenerate case (only the bottom of the tree shown): + * ... which often gets used in "naive" implementations of Huffman code + * generation. + * + * Many of these optimizations are based on the implementation in 7-Zip (source + * file: C/HuffEnc.c), which was placed in the public domain by Igor Pavlov. + * + * NOTE: in general, the same frequencies can be used to generate different + * length-limited canonical Huffman codes. One choice we have is during tree + * construction, when we must decide whether to prefer a leaf or non-leaf when + * there is a tie in frequency. Another choice we have is how to deal with + * codewords that would exceed @max_codeword_len bits in length. Both of these + * choices affect the resulting codeword lengths, which otherwise can be mapped + * uniquely onto the resulting canonical Huffman code. + * + * Normally, there is no problem with choosing one valid code over another, + * provided that they produce similar compression ratios. However, the LZMS + * compression format uses adaptive Huffman coding. It requires that both the + * decompressor and compressor build a canonical code equivalent to that which + * can be generated by using the classic Huffman tree construction algorithm and + * always processing leaves before non-leaves when there is a frequency tie. + * Therefore, we make sure to do this. This method also has the advantage of + * sometimes shortening the longest codeword that is generated. + * + * There also is the issue of how codewords longer than @max_codeword_len are + * dealt with. Fortunately, for LZMS this is irrelevant because for the LZMS + * alphabets no codeword can ever exceed LZMS_MAX_CODEWORD_LEN (= 15). Since + * the LZMS algorithm regularly halves all frequencies, the frequencies cannot + * become high enough for a length 16 codeword to be generated. Specifically, I + * think that if ties are broken in favor of non-leaves (as we do), the lowest + * total frequency that would give a length-16 codeword would be the sum of the + * frequencies 1 1 1 3 4 7 11 18 29 47 76 123 199 322 521 843 1364, which is + * 3570. And in LZMS we can't get a frequency that high based on the alphabet + * sizes, rebuild frequencies, and scaling factors. This worst-case scenario is + * based on the following degenerate case (only the bottom of the tree shown): * * ... * 17 @@ -576,57 +582,65 @@ gen_codewords(u32 A[restrict], u8 lens[restrict], * / \ * 1 1 * - * Excluding the first leaves (those with value 1), each leaf value must - * be greater than the non-leaf up 1 and down 2 from it; otherwise that - * leaf would have taken precedence over that non-leaf and been combined - * with the leaf below, thereby decreasing the height compared to that - * shown. - * - * Interesting fact: if we were to instead prioritize non-leaves over - * leaves, then the worst case frequencies would be the Fibonacci - * sequence, plus an extra frequency of 1. In this hypothetical - * scenario, it would be slightly easier for longer codewords to be - * generated. + * Excluding the first leaves (those with value 1), each leaf value must be + * greater than the non-leaf up 1 and down 2 from it; otherwise that leaf would + * have taken precedence over that non-leaf and been combined with the leaf + * below, thereby decreasing the height compared to that shown. + * + * Interesting fact: if we were to instead prioritize non-leaves over leaves, + * then the worst case frequencies would be the Fibonacci sequence, plus an + * extra frequency of 1. In this hypothetical scenario, it would be slightly + * easier for longer codewords to be generated. */ void make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len, - const u32 freqs[restrict], - u8 lens[restrict], u32 codewords[restrict]) + const u32 freqs[], u8 lens[], u32 codewords[]) { u32 *A = codewords; unsigned num_used_syms; - /* We begin by sorting the symbols primarily by frequency and - * secondarily by symbol value. As an optimization, the array - * used for this purpose ('A') shares storage with the space in - * which we will eventually return the codewords. */ + wimlib_assert(num_syms <= MAX_NUM_SYMS); + STATIC_ASSERT(MAX_NUM_SYMS <= 1 << NUM_SYMBOL_BITS); + wimlib_assert(max_codeword_len <= MAX_CODEWORD_LEN); + /* + * We begin by sorting the symbols primarily by frequency and + * secondarily by symbol value. As an optimization, the array used for + * this purpose ('A') shares storage with the space in which we will + * eventually return the codewords. + */ num_used_syms = sort_symbols(num_syms, freqs, lens, A); - /* 'num_used_syms' is the number of symbols with nonzero - * frequency. This may be less than @num_syms. 'num_used_syms' - * is also the number of entries in 'A' that are valid. Each - * entry consists of a distinct symbol and a nonzero frequency - * packed into a 32-bit integer. */ + /* + * 'num_used_syms' is the number of symbols with nonzero frequency. + * This may be less than @num_syms. 'num_used_syms' is also the number + * of entries in 'A' that are valid. Each entry consists of a distinct + * symbol and a nonzero frequency packed into a 32-bit integer. + */ - /* Handle special cases where only 0 or 1 symbols were used (had - * nonzero frequency). */ + /* + * Handle special cases where only 0 or 1 symbols were used (had nonzero + * frequency). + */ if (unlikely(num_used_syms == 0)) { - /* Code is empty. sort_symbols() already set all lengths - * to 0, so there is nothing more to do. */ + /* + * Code is empty. sort_symbols() already set all lengths to 0, + * so there is nothing more to do. + */ return; } if (unlikely(num_used_syms == 1)) { - /* Only one symbol was used, so we only need one - * codeword. But two codewords are needed to form the - * smallest complete Huffman code, which uses codewords 0 - * and 1. Therefore, we choose another symbol to which - * to assign a codeword. We use 0 (if the used symbol is - * not 0) or 1 (if the used symbol is 0). In either - * case, the lesser-valued symbol must be assigned - * codeword 0 so that the resulting code is canonical. */ + /* + * Only one symbol was used, so we only need one codeword. But + * two codewords are needed to form the smallest complete + * Huffman code, which uses codewords 0 and 1. Therefore, we + * choose another symbol to which to assign a codeword. We use + * 0 (if the used symbol is not 0) or 1 (if the used symbol is + * 0). In either case, the lesser-valued symbol must be + * assigned codeword 0 so that the resulting code is canonical. + */ unsigned sym = A[0] & SYMBOL_MASK; unsigned nonzero_idx = sym ? sym : 1; @@ -638,14 +652,16 @@ make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len, return; } - /* Build a stripped-down version of the Huffman tree, sharing the - * array 'A' with the symbol values. Then extract length counts - * from the tree and use them to generate the final codewords. */ + /* + * Build a stripped-down version of the Huffman tree, sharing the array + * 'A' with the symbol values. Then extract length counts from the tree + * and use them to generate the final codewords. + */ build_tree(A, num_used_syms); { - unsigned len_counts[max_codeword_len + 1]; + unsigned len_counts[MAX_CODEWORD_LEN + 1]; compute_length_counts(A, num_used_syms - 2, len_counts, max_codeword_len); diff --git a/src/divsufsort.c b/src/divsufsort.c index c80412f5..81a13f97 100644 --- a/src/divsufsort.c +++ b/src/divsufsort.c @@ -56,9 +56,6 @@ /*- Macros -*/ -#define SWAP swap -#define MIN min -#define MAX max #define STACK_PUSH(_a, _b, _c, _d)\ do {\ -- 2.43.0