4 * Code for decompression shared among multiple compression formats.
6 * Copyright 2022 Eric Biggers
8 * Permission is hereby granted, free of charge, to any person
9 * obtaining a copy of this software and associated documentation
10 * files (the "Software"), to deal in the Software without
11 * restriction, including without limitation the rights to use,
12 * copy, modify, merge, publish, distribute, sublicense, and/or sell
13 * copies of the Software, and to permit persons to whom the
14 * Software is furnished to do so, subject to the following
17 * The above copyright notice and this permission notice shall be
18 * included in all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27 * OTHER DEALINGS IN THE SOFTWARE.
37 # include <emmintrin.h>
40 #include "wimlib/decompress_common.h"
43 * make_huffman_decode_table() -
45 * Given an alphabet of symbols and the length of each symbol's codeword in a
46 * canonical prefix code, build a table for quickly decoding symbols that were
47 * encoded with that code.
49 * A _prefix code_ is an assignment of bitstrings called _codewords_ to symbols
50 * such that no whole codeword is a prefix of any other. A prefix code might be
51 * a _Huffman code_, which means that it is an optimum prefix code for a given
52 * list of symbol frequencies and was generated by the Huffman algorithm.
53 * Although the prefix codes processed here will ordinarily be "Huffman codes",
54 * strictly speaking the decoder cannot know whether a given code was actually
55 * generated by the Huffman algorithm or not.
57 * A prefix code is _canonical_ if and only if a longer codeword never
58 * lexicographically precedes a shorter codeword, and the lexicographic ordering
59 * of codewords of equal length is the same as the lexicographic ordering of the
60 * corresponding symbols. The advantage of using a canonical prefix code is
61 * that the codewords can be reconstructed from only the symbol => codeword
62 * length mapping. This eliminates the need to transmit the codewords
63 * explicitly. Instead, they can be enumerated in lexicographic order after
64 * sorting the symbols primarily by increasing codeword length and secondarily
65 * by increasing symbol value.
67 * However, the decoder's real goal is to decode symbols with the code, not just
68 * generate the list of codewords. Consequently, this function directly builds
69 * a table for efficiently decoding symbols using the code. The basic idea is
70 * that given the next 'max_codeword_len' bits of input, the decoder can look up
71 * the next decoded symbol by indexing a table containing '2^max_codeword_len'
72 * entries. A codeword with length 'max_codeword_len' will have exactly one
73 * entry in this table, whereas a codeword shorter than 'max_codeword_len' will
74 * have multiple entries in this table. Precisely, a codeword of length 'n'
75 * will have '2^(max_codeword_len - n)' entries. The index of each such entry,
76 * considered as a bitstring of length 'max_codeword_len', will contain the
77 * corresponding codeword as a prefix.
79 * That's the basic idea, but we extend it in two ways:
81 * - Often the maximum codeword length is too long for it to be efficient to
82 * build the full decode table whenever a new code is used. Instead, we build
83 * a "root" table using only '2^table_bits' entries, where 'table_bits <=
84 * max_codeword_len'. Then, a lookup of 'table_bits' bits produces either a
85 * symbol directly (for codewords not longer than 'table_bits'), or the index
86 * of a subtable which must be indexed with additional bits of input to fully
87 * decode the symbol (for codewords longer than 'table_bits').
89 * - Whenever the decoder decodes a symbol, it needs to know the codeword length
90 * so that it can remove the appropriate number of input bits. The obvious
91 * solution would be to simply retain the codeword lengths array and use the
92 * decoded symbol as an index into it. However, that would require two array
93 * accesses when decoding each symbol. Our strategy is to instead store the
94 * codeword length directly in the decode table entry along with the symbol.
96 * See MAKE_DECODE_TABLE_ENTRY() for full details on the format of decode table
97 * entries, and see read_huffsym() for full details on how symbols are decoded.
100 * The array in which to build the decode table. This must have been
101 * declared by the DECODE_TABLE() macro. This may alias @lens, since all
102 * @lens are consumed before the decode table is written to.
105 * The number of symbols in the alphabet.
108 * The log base 2 of the number of entries in the root table.
111 * An array of length @num_syms, indexed by symbol, that gives the length
112 * of the codeword, in bits, for each symbol. The length can be 0, which
113 * means that the symbol does not have a codeword assigned. In addition,
114 * @lens may alias @decode_table, as noted above.
117 * The maximum codeword length permitted for this code. All entries in
118 * 'lens' must be less than or equal to this value.
121 * A temporary array that was declared with DECODE_TABLE_WORKING_SPACE().
123 * Returns 0 on success, or -1 if the lengths do not form a valid prefix code.
126 make_huffman_decode_table(u16 decode_table[], unsigned num_syms,
127 unsigned table_bits, const u8 lens[],
128 unsigned max_codeword_len, u16 working_space[])
130 u16 * const len_counts = &working_space[0];
131 u16 * const offsets = &working_space[1 * (max_codeword_len + 1)];
132 u16 * const sorted_syms = &working_space[2 * (max_codeword_len + 1)];
134 void *entry_ptr = decode_table;
135 unsigned codeword_len = 1;
138 unsigned subtable_pos;
139 unsigned subtable_bits;
140 unsigned subtable_prefix;
142 /* Count how many codewords have each length, including 0. */
143 for (unsigned len = 0; len <= max_codeword_len; len++)
145 for (unsigned sym = 0; sym < num_syms; sym++)
146 len_counts[lens[sym]]++;
148 /* It is already guaranteed that all lengths are <= max_codeword_len,
149 * but it cannot be assumed they form a complete prefix code. A
150 * codeword of length n should require a proportion of the codespace
151 * equaling (1/2)^n. The code is complete if and only if, by this
152 * measure, the codespace is exactly filled by the lengths. */
153 for (unsigned len = 1; len <= max_codeword_len; len++) {
154 remainder = (remainder << 1) - len_counts[len];
155 /* Do the lengths overflow the codespace? */
156 if (unlikely(remainder < 0))
160 if (remainder != 0) {
161 /* The lengths do not fill the codespace; that is, they form an
162 * incomplete code. This is permitted only if the code is empty
163 * (contains no symbols). */
165 if (unlikely(remainder != 1U << max_codeword_len))
168 /* The code is empty. When processing a well-formed stream, the
169 * decode table need not be initialized in this case. However,
170 * we cannot assume the stream is well-formed, so we must
171 * initialize the decode table anyway. Setting all entries to 0
172 * makes the decode table always produce symbol '0' without
173 * consuming any bits, which is good enough. */
174 memset(decode_table, 0, sizeof(decode_table[0]) << table_bits);
178 /* Sort the symbols primarily by increasing codeword length and
179 * secondarily by increasing symbol value. */
181 /* Initialize 'offsets' so that 'offsets[len]' is the number of
182 * codewords shorter than 'len' bits, including length 0. */
184 for (unsigned len = 0; len < max_codeword_len; len++)
185 offsets[len + 1] = offsets[len] + len_counts[len];
187 /* Use the 'offsets' array to sort the symbols. */
188 for (unsigned sym = 0; sym < num_syms; sym++)
189 sorted_syms[offsets[lens[sym]]++] = sym;
192 * Fill the root table entries for codewords no longer than table_bits.
194 * The table will start with entries for the shortest codeword(s), which
195 * will have the most entries. From there, the number of entries per
196 * codeword will decrease. As an optimization, we may begin filling
197 * entries with SSE2 vector accesses (8 entries/store), then change to
198 * word accesses (2 or 4 entries/store), then change to 16-bit accesses
201 sym_idx = offsets[0];
204 /* Fill entries one 128-bit vector (8 entries) at a time. */
205 for (unsigned stores_per_loop = (1U << (table_bits - codeword_len)) /
206 (sizeof(__m128i) / sizeof(decode_table[0]));
207 stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1)
209 unsigned end_sym_idx = sym_idx + len_counts[codeword_len];
210 for (; sym_idx < end_sym_idx; sym_idx++) {
211 /* Note: unlike in the "word" version below, the __m128i
212 * type already has __attribute__((may_alias)), so using
213 * it to access an array of u16 will not violate strict
215 __m128i v = _mm_set1_epi16(
216 MAKE_DECODE_TABLE_ENTRY(sorted_syms[sym_idx],
218 unsigned n = stores_per_loop;
220 *(__m128i *)entry_ptr = v;
221 entry_ptr += sizeof(v);
225 #endif /* __SSE2__ */
228 /* Fill entries one word (2 or 4 entries) at a time. */
229 for (unsigned stores_per_loop = (1U << (table_bits - codeword_len)) /
230 (WORDBYTES / sizeof(decode_table[0]));
231 stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1)
233 unsigned end_sym_idx = sym_idx + len_counts[codeword_len];
234 for (; sym_idx < end_sym_idx; sym_idx++) {
236 /* Accessing the array of u16 as u32 or u64 would
237 * violate strict aliasing and would require compiling
238 * the code with -fno-strict-aliasing to guarantee
239 * correctness. To work around this problem, use the
240 * gcc 'may_alias' extension. */
241 typedef machine_word_t
242 __attribute__((may_alias)) aliased_word_t;
243 aliased_word_t v = repeat_u16(
244 MAKE_DECODE_TABLE_ENTRY(sorted_syms[sym_idx],
246 unsigned n = stores_per_loop;
248 *(aliased_word_t *)entry_ptr = v;
249 entry_ptr += sizeof(v);
253 #endif /* __GNUC__ */
255 /* Fill entries one at a time. */
256 for (unsigned stores_per_loop = (1U << (table_bits - codeword_len));
257 stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1)
259 unsigned end_sym_idx = sym_idx + len_counts[codeword_len];
260 for (; sym_idx < end_sym_idx; sym_idx++) {
261 u16 v = MAKE_DECODE_TABLE_ENTRY(sorted_syms[sym_idx],
263 unsigned n = stores_per_loop;
265 *(u16 *)entry_ptr = v;
266 entry_ptr += sizeof(v);
271 /* If all symbols were processed, then no subtables are required. */
272 if (sym_idx == num_syms)
275 /* At least one subtable is required. Process the remaining symbols. */
276 codeword = ((u16 *)entry_ptr - decode_table) << 1;
277 subtable_pos = 1U << table_bits;
278 subtable_bits = table_bits;
279 subtable_prefix = -1;
281 while (len_counts[codeword_len] == 0) {
286 unsigned prefix = codeword >> (codeword_len - table_bits);
288 /* Start a new subtable if the first 'table_bits' bits of the
289 * codeword don't match the prefix for the previous subtable, or
290 * if this will be the first subtable. */
291 if (prefix != subtable_prefix) {
293 subtable_prefix = prefix;
296 * Calculate the subtable length. If the codeword
297 * length exceeds 'table_bits' by n, then the subtable
298 * needs at least 2^n entries. But it may need more; if
299 * there are fewer than 2^n codewords of length
300 * 'table_bits + n' remaining, then n will need to be
301 * incremented to bring in longer codewords until the
302 * subtable can be filled completely. Note that it
303 * always will, eventually, be possible to fill the
304 * subtable, since it was previously verified that the
307 subtable_bits = codeword_len - table_bits;
308 remainder = (s32)1 << subtable_bits;
310 remainder -= len_counts[table_bits +
318 /* Create the entry that points from the root table to
319 * the subtable. This entry contains the index of the
320 * start of the subtable and the number of bits with
321 * which the subtable is indexed (the log base 2 of the
322 * number of entries it contains). */
323 decode_table[subtable_prefix] =
324 MAKE_DECODE_TABLE_ENTRY(subtable_pos,
328 /* Fill the subtable entries for this symbol. */
329 u16 entry = MAKE_DECODE_TABLE_ENTRY(sorted_syms[sym_idx],
330 codeword_len - table_bits);
331 unsigned n = 1U << (subtable_bits - (codeword_len -
334 decode_table[subtable_pos++] = entry;
337 len_counts[codeword_len]--;
339 } while (++sym_idx < num_syms);