4 * Code for decompression shared among multiple compression formats.
6 * The following copying information applies to this specific source code file:
8 * Written in 2012-2016 by Eric Biggers <ebiggers3@gmail.com>
10 * To the extent possible under law, the author(s) have dedicated all copyright
11 * and related and neighboring rights to this software to the public domain
12 * worldwide via the Creative Commons Zero 1.0 Universal Public Domain
13 * Dedication (the "CC0").
15 * This software is distributed in the hope that it will be useful, but WITHOUT
16 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 * FOR A PARTICULAR PURPOSE. See the CC0 for more details.
19 * You should have received a copy of the CC0 along with this software; if not
20 * see <http://creativecommons.org/publicdomain/zero/1.0/>.
30 # include <emmintrin.h>
33 #include "wimlib/decompress_common.h"
36 * make_huffman_decode_table() -
38 * Given an alphabet of symbols and the length of each symbol's codeword in a
39 * canonical prefix code, build a table for quickly decoding symbols that were
40 * encoded with that code.
42 * A _prefix code_ is an assignment of bitstrings called _codewords_ to symbols
43 * such that no whole codeword is a prefix of any other. A prefix code might be
44 * a _Huffman code_, which means that it is an optimum prefix code for a given
45 * list of symbol frequencies and was generated by the Huffman algorithm.
46 * Although the prefix codes processed here will ordinarily be "Huffman codes",
47 * strictly speaking the decoder cannot know whether a given code was actually
48 * generated by the Huffman algorithm or not.
50 * A prefix code is _canonical_ if and only if a longer codeword never
51 * lexicographically precedes a shorter codeword, and the lexicographic ordering
52 * of codewords of equal length is the same as the lexicographic ordering of the
53 * corresponding symbols. The advantage of using a canonical prefix code is
54 * that the codewords can be reconstructed from only the symbol => codeword
55 * length mapping. This eliminates the need to transmit the codewords
56 * explicitly. Instead, they can be enumerated in lexicographic order after
57 * sorting the symbols primarily by increasing codeword length and secondarily
58 * by increasing symbol value.
60 * However, the decoder's real goal is to decode symbols with the code, not just
61 * generate the list of codewords. Consequently, this function directly builds
62 * a table for efficiently decoding symbols using the code. The basic idea is
63 * that given the next 'max_codeword_len' bits of input, the decoder can look up
64 * the next decoded symbol by indexing a table containing '2^max_codeword_len'
65 * entries. A codeword with length 'max_codeword_len' will have exactly one
66 * entry in this table, whereas a codeword shorter than 'max_codeword_len' will
67 * have multiple entries in this table. Precisely, a codeword of length 'n'
68 * will have '2^(max_codeword_len - n)' entries. The index of each such entry,
69 * considered as a bitstring of length 'max_codeword_len', will contain the
70 * corresponding codeword as a prefix.
72 * That's the basic idea, but we extend it in two ways:
74 * - Often the maximum codeword length is too long for it to be efficient to
75 * build the full decode table whenever a new code is used. Instead, we build
76 * a "root" table using only '2^table_bits' entries, where 'table_bits <=
77 * max_codeword_len'. Then, a lookup of 'table_bits' bits produces either a
78 * symbol directly (for codewords not longer than 'table_bits'), or the index
79 * of a subtable which must be indexed with additional bits of input to fully
80 * decode the symbol (for codewords longer than 'table_bits').
82 * - Whenever the decoder decodes a symbol, it needs to know the codeword length
83 * so that it can remove the appropriate number of input bits. The obvious
84 * solution would be to simply retain the codeword lengths array and use the
85 * decoded symbol as an index into it. However, that would require two array
86 * accesses when decoding each symbol. Our strategy is to instead store the
87 * codeword length directly in the decode table entry along with the symbol.
89 * See MAKE_DECODE_TABLE_ENTRY() for full details on the format of decode table
90 * entries, and see read_huffsym() for full details on how symbols are decoded.
93 * The array in which to build the decode table. This must have been
94 * declared by the DECODE_TABLE() macro. This may alias @lens, since all
95 * @lens are consumed before the decode table is written to.
98 * The number of symbols in the alphabet.
101 * The log base 2 of the number of entries in the root table.
104 * An array of length @num_syms, indexed by symbol, that gives the length
105 * of the codeword, in bits, for each symbol. The length can be 0, which
106 * means that the symbol does not have a codeword assigned. In addition,
107 * @lens may alias @decode_table, as noted above.
110 * The maximum codeword length permitted for this code. All entries in
111 * 'lens' must be less than or equal to this value.
114 * A temporary array that was declared with DECODE_TABLE_WORKING_SPACE().
116 * Returns 0 on success, or -1 if the lengths do not form a valid prefix code.
119 make_huffman_decode_table(u16 decode_table[], unsigned num_syms,
120 unsigned table_bits, const u8 lens[],
121 unsigned max_codeword_len, u16 working_space[])
123 u16 * const len_counts = &working_space[0];
124 u16 * const offsets = &working_space[1 * (max_codeword_len + 1)];
125 u16 * const sorted_syms = &working_space[2 * (max_codeword_len + 1)];
127 void *entry_ptr = decode_table;
128 unsigned codeword_len = 1;
131 unsigned subtable_pos;
132 unsigned subtable_bits;
133 unsigned subtable_prefix;
135 /* Count how many codewords have each length, including 0. */
136 for (unsigned len = 0; len <= max_codeword_len; len++)
138 for (unsigned sym = 0; sym < num_syms; sym++)
139 len_counts[lens[sym]]++;
141 /* It is already guaranteed that all lengths are <= max_codeword_len,
142 * but it cannot be assumed they form a complete prefix code. A
143 * codeword of length n should require a proportion of the codespace
144 * equaling (1/2)^n. The code is complete if and only if, by this
145 * measure, the codespace is exactly filled by the lengths. */
146 for (unsigned len = 1; len <= max_codeword_len; len++) {
147 remainder = (remainder << 1) - len_counts[len];
148 /* Do the lengths overflow the codespace? */
149 if (unlikely(remainder < 0))
153 if (remainder != 0) {
154 /* The lengths do not fill the codespace; that is, they form an
155 * incomplete code. This is permitted only if the code is empty
156 * (contains no symbols). */
158 if (unlikely(remainder != 1U << max_codeword_len))
161 /* The code is empty. When processing a well-formed stream, the
162 * decode table need not be initialized in this case. However,
163 * we cannot assume the stream is well-formed, so we must
164 * initialize the decode table anyway. Setting all entries to 0
165 * makes the decode table always produce symbol '0' without
166 * consuming any bits, which is good enough. */
167 memset(decode_table, 0, sizeof(decode_table[0]) << table_bits);
171 /* Sort the symbols primarily by increasing codeword length and
172 * secondarily by increasing symbol value. */
174 /* Initialize 'offsets' so that 'offsets[len]' is the number of
175 * codewords shorter than 'len' bits, including length 0. */
177 for (unsigned len = 0; len < max_codeword_len; len++)
178 offsets[len + 1] = offsets[len] + len_counts[len];
180 /* Use the 'offsets' array to sort the symbols. */
181 for (unsigned sym = 0; sym < num_syms; sym++)
182 sorted_syms[offsets[lens[sym]]++] = sym;
185 * Fill the root table entries for codewords no longer than table_bits.
187 * The table will start with entries for the shortest codeword(s), which
188 * will have the most entries. From there, the number of entries per
189 * codeword will decrease. As an optimization, we may begin filling
190 * entries with SSE2 vector accesses (8 entries/store), then change to
191 * word accesses (2 or 4 entries/store), then change to 16-bit accesses
194 sym_idx = offsets[0];
197 /* Fill entries one 128-bit vector (8 entries) at a time. */
198 for (unsigned stores_per_loop = (1U << (table_bits - codeword_len)) /
199 (sizeof(__m128i) / sizeof(decode_table[0]));
200 stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1)
202 unsigned end_sym_idx = sym_idx + len_counts[codeword_len];
203 for (; sym_idx < end_sym_idx; sym_idx++) {
204 /* Note: unlike in the "word" version below, the __m128i
205 * type already has __attribute__((may_alias)), so using
206 * it to access an array of u16 will not violate strict
208 __m128i v = _mm_set1_epi16(
209 MAKE_DECODE_TABLE_ENTRY(sorted_syms[sym_idx],
211 unsigned n = stores_per_loop;
213 *(__m128i *)entry_ptr = v;
214 entry_ptr += sizeof(v);
218 #endif /* __SSE2__ */
221 /* Fill entries one word (2 or 4 entries) at a time. */
222 for (unsigned stores_per_loop = (1U << (table_bits - codeword_len)) /
223 (WORDBYTES / sizeof(decode_table[0]));
224 stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1)
226 unsigned end_sym_idx = sym_idx + len_counts[codeword_len];
227 for (; sym_idx < end_sym_idx; sym_idx++) {
229 /* Accessing the array of u16 as u32 or u64 would
230 * violate strict aliasing and would require compiling
231 * the code with -fno-strict-aliasing to guarantee
232 * correctness. To work around this problem, use the
233 * gcc 'may_alias' extension. */
234 typedef machine_word_t
235 __attribute__((may_alias)) aliased_word_t;
236 aliased_word_t v = repeat_u16(
237 MAKE_DECODE_TABLE_ENTRY(sorted_syms[sym_idx],
239 unsigned n = stores_per_loop;
241 *(aliased_word_t *)entry_ptr = v;
242 entry_ptr += sizeof(v);
246 #endif /* __GNUC__ */
248 /* Fill entries one at a time. */
249 for (unsigned stores_per_loop = (1U << (table_bits - codeword_len));
250 stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1)
252 unsigned end_sym_idx = sym_idx + len_counts[codeword_len];
253 for (; sym_idx < end_sym_idx; sym_idx++) {
254 u16 v = MAKE_DECODE_TABLE_ENTRY(sorted_syms[sym_idx],
256 unsigned n = stores_per_loop;
258 *(u16 *)entry_ptr = v;
259 entry_ptr += sizeof(v);
264 /* If all symbols were processed, then no subtables are required. */
265 if (sym_idx == num_syms)
268 /* At least one subtable is required. Process the remaining symbols. */
269 codeword = ((u16 *)entry_ptr - decode_table) << 1;
270 subtable_pos = 1U << table_bits;
271 subtable_bits = table_bits;
272 subtable_prefix = -1;
274 while (len_counts[codeword_len] == 0) {
279 unsigned prefix = codeword >> (codeword_len - table_bits);
281 /* Start a new subtable if the first 'table_bits' bits of the
282 * codeword don't match the prefix for the previous subtable, or
283 * if this will be the first subtable. */
284 if (prefix != subtable_prefix) {
286 subtable_prefix = prefix;
289 * Calculate the subtable length. If the codeword
290 * length exceeds 'table_bits' by n, then the subtable
291 * needs at least 2^n entries. But it may need more; if
292 * there are fewer than 2^n codewords of length
293 * 'table_bits + n' remaining, then n will need to be
294 * incremented to bring in longer codewords until the
295 * subtable can be filled completely. Note that it
296 * always will, eventually, be possible to fill the
297 * subtable, since it was previously verified that the
300 subtable_bits = codeword_len - table_bits;
301 remainder = (s32)1 << subtable_bits;
303 remainder -= len_counts[table_bits +
311 /* Create the entry that points from the root table to
312 * the subtable. This entry contains the index of the
313 * start of the subtable and the number of bits with
314 * which the subtable is indexed (the log base 2 of the
315 * number of entries it contains). */
316 decode_table[subtable_prefix] =
317 MAKE_DECODE_TABLE_ENTRY(subtable_pos,
321 /* Fill the subtable entries for this symbol. */
322 u16 entry = MAKE_DECODE_TABLE_ENTRY(sorted_syms[sym_idx],
323 codeword_len - table_bits);
324 unsigned n = 1U << (subtable_bits - (codeword_len -
327 decode_table[subtable_pos++] = entry;
330 len_counts[codeword_len]--;
332 } while (++sym_idx < num_syms);