4 * Code for decompression shared among multiple compression formats.
6 * The following copying information applies to this specific source code file:
8 * Written in 2012-2016 by Eric Biggers <ebiggers3@gmail.com>
10 * To the extent possible under law, the author(s) have dedicated all copyright
11 * and related and neighboring rights to this software to the public domain
12 * worldwide via the Creative Commons Zero 1.0 Universal Public Domain
13 * Dedication (the "CC0").
15 * This software is distributed in the hope that it will be useful, but WITHOUT
16 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 * FOR A PARTICULAR PURPOSE. See the CC0 for more details.
19 * You should have received a copy of the CC0 along with this software; if not
20 * see <http://creativecommons.org/publicdomain/zero/1.0/>.
30 # include <emmintrin.h>
33 #include "wimlib/decompress_common.h"
35 #define MAKE_ENTRY(sym, len) (((sym) << DECODE_TABLE_SYMBOL_SHIFT) | (len))
38 * make_huffman_decode_table() -
40 * Build a decoding table for a canonical prefix code, or "Huffman code". This
41 * takes as input the length of the codeword for each symbol in the code and
42 * produces as output a table for fast symbol decoding with read_huffsym().
44 * Because the code is assumed to be "canonical", it can be reconstructed
45 * directly from the codeword lengths. A prefix code is canonical if and only
46 * if a longer codeword never lexicographically precedes a shorter codeword, and
47 * the lexicographic ordering of codewords of the same length is the same as the
48 * lexicographic ordering of the corresponding symbols. Consequently, we can
49 * sort the symbols primarily by codeword length and secondarily by symbol
50 * value, then reconstruct the code by generating codewords lexicographically in
53 * This function does not, however, generate the code explicitly. Instead, it
54 * directly builds a table for decoding symbols using the code. The basic idea
55 * is this: given the next 'max_codeword_len' bits in the input, we can look up
56 * the decoded symbol by indexing a table containing 2**max_codeword_len
57 * entries. A codeword with length 'max_codeword_len' will have exactly one
58 * entry in this table, whereas a codeword shorter than 'max_codeword_len' will
59 * have multiple entries in this table. Precisely, a codeword of length n will
60 * be represented by 2**(max_codeword_len - n) entries in this table. The
61 * 0-based index of each such entry will contain the corresponding codeword as a
62 * prefix when zero-padded on the left to 'max_codeword_len' binary digits.
64 * That's the basic idea, but we implement two optimizations:
66 * - Often the maximum codeword length is too long for it to be efficient to
67 * build the full decoding table whenever a new code is used. Instead, we can
68 * build the table using only 2**table_bits entries, where 'table_bits <=
69 * max_codeword_len'. Then, a lookup of 'table_bits' bits will produce either
70 * a codeword directly (for codewords not longer than 'table_bits') or the
71 * index of a subtable which must be indexed with additional bits of input to
72 * decode the full codeword (for codewords longer than 'table_bits').
74 * - When we decode a symbol, we still need to know its codeword length so that
75 * the bitstream can be advanced by the appropriate number of bits. The
76 * obvious solution is to simply retain the 'lens' array and use the decoded
77 * symbol as an index into it. However, this requires two separate array
78 * accesses in the fast path. The optimization is to store the length
79 * directly in the decode table, along with the symbol.
82 * The array in which to build the decode table. This must have been
83 * declared by the DECODE_TABLE() macro. This may alias @lens, since all
84 * @lens are consumed before the decode table is written to.
87 * The number of symbols in the alphabet.
90 * The log base 2 of the number of entries in the root table.
93 * An array of length @num_syms, indexable by symbol, that gives the length
94 * of the codeword, in bits, for that symbol. The length can be 0, which
95 * means that the symbol does not have a codeword assigned. In addition,
96 * @lens may alias @decode_table, as noted above.
99 * The maximum codeword length permitted for this code. All entries in
100 * 'lens' must be less than or equal to this value.
102 * Returns 0 on success, or -1 if the lengths do not form a valid prefix code.
105 make_huffman_decode_table(u16 decode_table[], unsigned num_syms,
106 unsigned table_bits, const u8 lens[],
107 unsigned max_codeword_len)
109 u16 offsets[max_codeword_len + 1];
110 u16 len_counts[max_codeword_len + 1];
111 u16 sorted_syms[num_syms];
113 void *entry_ptr = decode_table;
114 unsigned codeword_len = 1;
117 unsigned subtable_pos;
118 unsigned subtable_bits;
119 unsigned subtable_prefix;
121 /* Count how many codewords have each length, including 0. */
122 for (unsigned len = 0; len <= max_codeword_len; len++)
124 for (unsigned sym = 0; sym < num_syms; sym++)
125 len_counts[lens[sym]]++;
127 /* It is already guaranteed that all lengths are <= max_codeword_len,
128 * but it cannot be assumed they form a complete prefix code. A
129 * codeword of length n should require a proportion of the codespace
130 * equaling (1/2)^n. The code is complete if and only if, by this
131 * measure, the codespace is exactly filled by the lengths. */
132 for (unsigned len = 1; len <= max_codeword_len; len++) {
133 remainder = (remainder << 1) - len_counts[len];
134 /* Do the lengths overflow the codespace? */
135 if (unlikely(remainder < 0))
139 if (remainder != 0) {
140 /* The lengths do not fill the codespace; that is, they form an
141 * incomplete code. This is permitted only if the code is empty
142 * (contains no symbols). */
144 if (unlikely(remainder != (s32)1 << max_codeword_len))
147 /* The code is empty. When processing a well-formed stream, the
148 * decode table need not be initialized in this case. However,
149 * we cannot assume the stream is well-formed, so we must
150 * initialize the decode table anyway. Setting all entries to 0
151 * makes this table always produce symbol '0' without consuming
152 * any bits, which is good enough. */
153 memset(decode_table, 0,
154 (1U << table_bits) * sizeof(decode_table[0]));
158 /* Sort the symbols primarily by increasing codeword length and
159 * secondarily by increasing symbol value. */
161 /* Initialize 'offsets' so that 'offsets[len]' is the number of
162 * codewords shorter than 'len' bits, including length 0. */
164 for (unsigned len = 0; len < max_codeword_len; len++)
165 offsets[len + 1] = offsets[len] + len_counts[len];
167 /* Use the 'offsets' array to sort the symbols. */
168 for (unsigned sym = 0; sym < num_syms; sym++)
169 sorted_syms[offsets[lens[sym]]++] = sym;
172 * Fill entries for codewords with length <= table_bits
173 * --- that is, those short enough for a direct mapping.
175 * The table will start with entries for the shortest codeword(s), which
176 * have the most entries. From there, the number of entries per
177 * codeword will decrease. As an optimization, we may begin filling
178 * entries with SSE2 vector accesses (8 entries/store), then change to
179 * 'machine_word_t' accesses (2 or 4 entries/store), then change to
180 * 16-bit accesses (1 entry/store).
182 sym_idx = offsets[0];
184 /* Fill entries one 128-bit vector (8 entries) at a time. */
185 for (unsigned stores_per_loop = (1U << (table_bits - codeword_len)) /
186 (sizeof(__m128i) / sizeof(decode_table[0]));
187 stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1)
189 unsigned end_sym_idx = sym_idx + len_counts[codeword_len];
190 for (; sym_idx < end_sym_idx; sym_idx++) {
191 /* Note: unlike in the "word" version below, the __m128i
192 * type already has __attribute__((may_alias)), so using
193 * it to access an array of u16 will not violate strict
195 __m128i v = _mm_set1_epi16(
196 MAKE_ENTRY(sorted_syms[sym_idx], codeword_len));
197 unsigned n = stores_per_loop;
199 *(__m128i *)entry_ptr = v;
200 entry_ptr += sizeof(v);
204 #endif /* __SSE2__ */
207 /* Fill entries one word (2 or 4 entries) at a time. */
208 for (unsigned stores_per_loop = (1U << (table_bits - codeword_len)) /
209 (WORDBYTES / sizeof(decode_table[0]));
210 stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1)
212 unsigned end_sym_idx = sym_idx + len_counts[codeword_len];
213 for (; sym_idx < end_sym_idx; sym_idx++) {
215 /* Accessing the array of u16 as u32 or u64 would
216 * violate strict aliasing and would require compiling
217 * the code with -fno-strict-aliasing to guarantee
218 * correctness. To work around this problem, use the
219 * gcc 'may_alias' extension. */
220 typedef machine_word_t
221 __attribute__((may_alias)) aliased_word_t;
222 aliased_word_t v = repeat_u16(
223 MAKE_ENTRY(sorted_syms[sym_idx], codeword_len));
224 unsigned n = stores_per_loop;
227 *(aliased_word_t *)entry_ptr = v;
228 entry_ptr += sizeof(v);
232 #endif /* __GNUC__ */
234 /* Fill entries one at a time. */
235 for (unsigned stores_per_loop = (1U << (table_bits - codeword_len));
236 stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1)
238 unsigned end_sym_idx = sym_idx + len_counts[codeword_len];
239 for (; sym_idx < end_sym_idx; sym_idx++) {
240 u16 v = MAKE_ENTRY(sorted_syms[sym_idx], codeword_len);
241 unsigned n = stores_per_loop;
243 *(u16 *)entry_ptr = v;
244 entry_ptr += sizeof(v);
249 /* If all symbols were processed, then no subtables are required. */
250 if (sym_idx == num_syms)
253 /* At least one subtable is required. Process the remaining symbols. */
254 codeword = ((u16 *)entry_ptr - decode_table) << 1;
255 subtable_pos = 1U << table_bits;
256 subtable_bits = table_bits;
257 subtable_prefix = -1;
259 while (len_counts[codeword_len] == 0) {
264 unsigned prefix = codeword >> (codeword_len - table_bits);
266 /* Start a new subtable if the first 'table_bits' bits of the
267 * codeword don't match the prefix for the previous subtable, or
268 * if this will be the first subtable. */
269 if (prefix != subtable_prefix) {
271 subtable_prefix = prefix;
273 /* Calculate the subtable length. If the codeword
274 * length exceeds 'table_bits' by n, the subtable needs
275 * at least 2**n entries. But it may need more; if
276 * there are fewer than 2**n codewords of length
277 * 'table_bits + n' remaining, then n will need to be
278 * incremented to bring in longer codewords until the
279 * subtable can be filled completely. Note that it
280 * always will, eventually, be possible to fill the
281 * subtable, since it was previously verified that the
282 * code is complete. */
283 subtable_bits = codeword_len - table_bits;
284 remainder = (s32)1 << subtable_bits;
286 remainder -= len_counts[table_bits +
294 /* Create the entry that points from the root table to
295 * the subtable. This entry contains the index of the
296 * start of the subtable and the number of bits with
297 * which the subtable is indexed (the log base 2 of the
298 * number of entries it contains). */
299 decode_table[subtable_prefix] =
300 MAKE_ENTRY(subtable_pos, subtable_bits);
303 u16 entry = MAKE_ENTRY(sorted_syms[sym_idx],
304 codeword_len - table_bits);
305 unsigned n = 1U << (subtable_bits - (codeword_len -
308 decode_table[subtable_pos++] = entry;
311 len_counts[codeword_len]--;
313 } while (++sym_idx < num_syms);