/*
* decompress_common.c
*
* Code for decompression shared among multiple compression formats.
*
* The following copying information applies to this specific source code file:
*
* Written in 2012-2016 by Eric Biggers
*
* To the extent possible under law, the author(s) have dedicated all copyright
* and related and neighboring rights to this software to the public domain
* worldwide via the Creative Commons Zero 1.0 Universal Public Domain
* Dedication (the "CC0").
*
* This software is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the CC0 for more details.
*
* You should have received a copy of the CC0 along with this software; if not
* see .
*/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
#include
#ifdef __SSE2__
# include
#endif
#include "wimlib/decompress_common.h"
/*
* make_huffman_decode_table() -
*
* Given an alphabet of symbols and the length of each symbol's codeword in a
* canonical prefix code, build a table for quickly decoding symbols that were
* encoded with that code.
*
* A _prefix code_ is an assignment of bitstrings called _codewords_ to symbols
* such that no whole codeword is a prefix of any other. A prefix code might be
* a _Huffman code_, which means that it is an optimum prefix code for a given
* list of symbol frequencies and was generated by the Huffman algorithm.
* Although the prefix codes processed here will ordinarily be "Huffman codes",
* strictly speaking the decoder cannot know whether a given code was actually
* generated by the Huffman algorithm or not.
*
* A prefix code is _canonical_ if and only if a longer codeword never
* lexicographically precedes a shorter codeword, and the lexicographic ordering
* of codewords of equal length is the same as the lexicographic ordering of the
* corresponding symbols. The advantage of using a canonical prefix code is
* that the codewords can be reconstructed from only the symbol => codeword
* length mapping. This eliminates the need to transmit the codewords
* explicitly. Instead, they can be enumerated in lexicographic order after
* sorting the symbols primarily by increasing codeword length and secondarily
* by increasing symbol value.
*
* However, the decoder's real goal is to decode symbols with the code, not just
* generate the list of codewords. Consequently, this function directly builds
* a table for efficiently decoding symbols using the code. The basic idea is
* that given the next 'max_codeword_len' bits of input, the decoder can look up
* the next decoded symbol by indexing a table containing '2^max_codeword_len'
* entries. A codeword with length 'max_codeword_len' will have exactly one
* entry in this table, whereas a codeword shorter than 'max_codeword_len' will
* have multiple entries in this table. Precisely, a codeword of length 'n'
* will have '2^(max_codeword_len - n)' entries. The index of each such entry,
* considered as a bitstring of length 'max_codeword_len', will contain the
* corresponding codeword as a prefix.
*
* That's the basic idea, but we extend it in two ways:
*
* - Often the maximum codeword length is too long for it to be efficient to
* build the full decode table whenever a new code is used. Instead, we build
* a "root" table using only '2^table_bits' entries, where 'table_bits <=
* max_codeword_len'. Then, a lookup of 'table_bits' bits produces either a
* symbol directly (for codewords not longer than 'table_bits'), or the index
* of a subtable which must be indexed with additional bits of input to fully
* decode the symbol (for codewords longer than 'table_bits').
*
* - Whenever the decoder decodes a symbol, it needs to know the codeword length
* so that it can remove the appropriate number of input bits. The obvious
* solution would be to simply retain the codeword lengths array and use the
* decoded symbol as an index into it. However, that would require two array
* accesses when decoding each symbol. Our strategy is to instead store the
* codeword length directly in the decode table entry along with the symbol.
*
* See MAKE_DECODE_TABLE_ENTRY() for full details on the format of decode table
* entries, and see read_huffsym() for full details on how symbols are decoded.
*
* @decode_table:
* The array in which to build the decode table. This must have been
* declared by the DECODE_TABLE() macro. This may alias @lens, since all
* @lens are consumed before the decode table is written to.
*
* @num_syms:
* The number of symbols in the alphabet.
*
* @table_bits:
* The log base 2 of the number of entries in the root table.
*
* @lens:
* An array of length @num_syms, indexed by symbol, that gives the length
* of the codeword, in bits, for each symbol. The length can be 0, which
* means that the symbol does not have a codeword assigned. In addition,
* @lens may alias @decode_table, as noted above.
*
* @max_codeword_len:
* The maximum codeword length permitted for this code. All entries in
* 'lens' must be less than or equal to this value.
*
* Returns 0 on success, or -1 if the lengths do not form a valid prefix code.
*/
int
make_huffman_decode_table(u16 decode_table[], unsigned num_syms,
unsigned table_bits, const u8 lens[],
unsigned max_codeword_len)
{
u16 len_counts[max_codeword_len + 1];
u16 offsets[max_codeword_len + 1];
u16 sorted_syms[num_syms];
s32 remainder = 1;
void *entry_ptr = decode_table;
unsigned codeword_len = 1;
unsigned sym_idx;
unsigned codeword;
unsigned subtable_pos;
unsigned subtable_bits;
unsigned subtable_prefix;
/* Count how many codewords have each length, including 0. */
for (unsigned len = 0; len <= max_codeword_len; len++)
len_counts[len] = 0;
for (unsigned sym = 0; sym < num_syms; sym++)
len_counts[lens[sym]]++;
/* It is already guaranteed that all lengths are <= max_codeword_len,
* but it cannot be assumed they form a complete prefix code. A
* codeword of length n should require a proportion of the codespace
* equaling (1/2)^n. The code is complete if and only if, by this
* measure, the codespace is exactly filled by the lengths. */
for (unsigned len = 1; len <= max_codeword_len; len++) {
remainder = (remainder << 1) - len_counts[len];
/* Do the lengths overflow the codespace? */
if (unlikely(remainder < 0))
return -1;
}
if (remainder != 0) {
/* The lengths do not fill the codespace; that is, they form an
* incomplete code. This is permitted only if the code is empty
* (contains no symbols). */
if (unlikely(remainder != 1U << max_codeword_len))
return -1;
/* The code is empty. When processing a well-formed stream, the
* decode table need not be initialized in this case. However,
* we cannot assume the stream is well-formed, so we must
* initialize the decode table anyway. Setting all entries to 0
* makes the decode table always produce symbol '0' without
* consuming any bits, which is good enough. */
memset(decode_table, 0, sizeof(decode_table[0]) << table_bits);
return 0;
}
/* Sort the symbols primarily by increasing codeword length and
* secondarily by increasing symbol value. */
/* Initialize 'offsets' so that 'offsets[len]' is the number of
* codewords shorter than 'len' bits, including length 0. */
offsets[0] = 0;
for (unsigned len = 0; len < max_codeword_len; len++)
offsets[len + 1] = offsets[len] + len_counts[len];
/* Use the 'offsets' array to sort the symbols. */
for (unsigned sym = 0; sym < num_syms; sym++)
sorted_syms[offsets[lens[sym]]++] = sym;
/*
* Fill the root table entries for codewords no longer than table_bits.
*
* The table will start with entries for the shortest codeword(s), which
* will have the most entries. From there, the number of entries per
* codeword will decrease. As an optimization, we may begin filling
* entries with SSE2 vector accesses (8 entries/store), then change to
* word accesses (2 or 4 entries/store), then change to 16-bit accesses
* (1 entry/store).
*/
sym_idx = offsets[0];
#ifdef __SSE2__
/* Fill entries one 128-bit vector (8 entries) at a time. */
for (unsigned stores_per_loop = (1U << (table_bits - codeword_len)) /
(sizeof(__m128i) / sizeof(decode_table[0]));
stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1)
{
unsigned end_sym_idx = sym_idx + len_counts[codeword_len];
for (; sym_idx < end_sym_idx; sym_idx++) {
/* Note: unlike in the "word" version below, the __m128i
* type already has __attribute__((may_alias)), so using
* it to access an array of u16 will not violate strict
* aliasing. */
__m128i v = _mm_set1_epi16(
MAKE_DECODE_TABLE_ENTRY(sorted_syms[sym_idx],
codeword_len));
unsigned n = stores_per_loop;
do {
*(__m128i *)entry_ptr = v;
entry_ptr += sizeof(v);
} while (--n);
}
}
#endif /* __SSE2__ */
#ifdef __GNUC__
/* Fill entries one word (2 or 4 entries) at a time. */
for (unsigned stores_per_loop = (1U << (table_bits - codeword_len)) /
(WORDBYTES / sizeof(decode_table[0]));
stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1)
{
unsigned end_sym_idx = sym_idx + len_counts[codeword_len];
for (; sym_idx < end_sym_idx; sym_idx++) {
/* Accessing the array of u16 as u32 or u64 would
* violate strict aliasing and would require compiling
* the code with -fno-strict-aliasing to guarantee
* correctness. To work around this problem, use the
* gcc 'may_alias' extension. */
typedef machine_word_t
__attribute__((may_alias)) aliased_word_t;
aliased_word_t v = repeat_u16(
MAKE_DECODE_TABLE_ENTRY(sorted_syms[sym_idx],
codeword_len));
unsigned n = stores_per_loop;
do {
*(aliased_word_t *)entry_ptr = v;
entry_ptr += sizeof(v);
} while (--n);
}
}
#endif /* __GNUC__ */
/* Fill entries one at a time. */
for (unsigned stores_per_loop = (1U << (table_bits - codeword_len));
stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1)
{
unsigned end_sym_idx = sym_idx + len_counts[codeword_len];
for (; sym_idx < end_sym_idx; sym_idx++) {
u16 v = MAKE_DECODE_TABLE_ENTRY(sorted_syms[sym_idx],
codeword_len);
unsigned n = stores_per_loop;
do {
*(u16 *)entry_ptr = v;
entry_ptr += sizeof(v);
} while (--n);
}
}
/* If all symbols were processed, then no subtables are required. */
if (sym_idx == num_syms)
return 0;
/* At least one subtable is required. Process the remaining symbols. */
codeword = ((u16 *)entry_ptr - decode_table) << 1;
subtable_pos = 1U << table_bits;
subtable_bits = table_bits;
subtable_prefix = -1;
do {
while (len_counts[codeword_len] == 0) {
codeword_len++;
codeword <<= 1;
}
unsigned prefix = codeword >> (codeword_len - table_bits);
/* Start a new subtable if the first 'table_bits' bits of the
* codeword don't match the prefix for the previous subtable, or
* if this will be the first subtable. */
if (prefix != subtable_prefix) {
subtable_prefix = prefix;
/*
* Calculate the subtable length. If the codeword
* length exceeds 'table_bits' by n, then the subtable
* needs at least 2^n entries. But it may need more; if
* there are fewer than 2^n codewords of length
* 'table_bits + n' remaining, then n will need to be
* incremented to bring in longer codewords until the
* subtable can be filled completely. Note that it
* always will, eventually, be possible to fill the
* subtable, since it was previously verified that the
* code is complete.
*/
subtable_bits = codeword_len - table_bits;
remainder = (s32)1 << subtable_bits;
for (;;) {
remainder -= len_counts[table_bits +
subtable_bits];
if (remainder <= 0)
break;
subtable_bits++;
remainder <<= 1;
}
/* Create the entry that points from the root table to
* the subtable. This entry contains the index of the
* start of the subtable and the number of bits with
* which the subtable is indexed (the log base 2 of the
* number of entries it contains). */
decode_table[subtable_prefix] =
MAKE_DECODE_TABLE_ENTRY(subtable_pos,
subtable_bits);
}
/* Fill the subtable entries for this symbol. */
u16 entry = MAKE_DECODE_TABLE_ENTRY(sorted_syms[sym_idx],
codeword_len - table_bits);
unsigned n = 1U << (subtable_bits - (codeword_len -
table_bits));
do {
decode_table[subtable_pos++] = entry;
} while (--n);
len_counts[codeword_len]--;
codeword++;
} while (++sym_idx < num_syms);
return 0;
}