wimlib.net Git - wimlib/blob - src/decompress_common.c

   1 /*
   2  * decompress_common.c
   3  *
   4  * Code for decompression shared among multiple compression formats.
   5  *
   6  * Copyright 2022 Eric Biggers
   7  *
   8  * Permission is hereby granted, free of charge, to any person
   9  * obtaining a copy of this software and associated documentation
  10  * files (the "Software"), to deal in the Software without
  11  * restriction, including without limitation the rights to use,
  12  * copy, modify, merge, publish, distribute, sublicense, and/or sell
  13  * copies of the Software, and to permit persons to whom the
  14  * Software is furnished to do so, subject to the following
  15  * conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be
  18  * included in all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  21  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
  22  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  23  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
  24  * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
  25  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  26  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  27  * OTHER DEALINGS IN THE SOFTWARE.
  28  */
  29
  30 #ifdef HAVE_CONFIG_H
  31 #  include "config.h"
  32 #endif
  33
  34 #include <string.h>
  35
  36 #ifdef __SSE2__
  37 #  include <emmintrin.h>
  38 #endif
  39
  40 #include "wimlib/decompress_common.h"
  41
  42 /*
  43  * make_huffman_decode_table() -
  44  *
  45  * Given an alphabet of symbols and the length of each symbol's codeword in a
  46  * canonical prefix code, build a table for quickly decoding symbols that were
  47  * encoded with that code.
  48  *
  49  * A _prefix code_ is an assignment of bitstrings called _codewords_ to symbols
  50  * such that no whole codeword is a prefix of any other.  A prefix code might be
  51  * a _Huffman code_, which means that it is an optimum prefix code for a given
  52  * list of symbol frequencies and was generated by the Huffman algorithm.
  53  * Although the prefix codes processed here will ordinarily be "Huffman codes",
  54  * strictly speaking the decoder cannot know whether a given code was actually
  55  * generated by the Huffman algorithm or not.
  56  *
  57  * A prefix code is _canonical_ if and only if a longer codeword never
  58  * lexicographically precedes a shorter codeword, and the lexicographic ordering
  59  * of codewords of equal length is the same as the lexicographic ordering of the
  60  * corresponding symbols.  The advantage of using a canonical prefix code is
  61  * that the codewords can be reconstructed from only the symbol => codeword
  62  * length mapping.  This eliminates the need to transmit the codewords
  63  * explicitly.  Instead, they can be enumerated in lexicographic order after
  64  * sorting the symbols primarily by increasing codeword length and secondarily
  65  * by increasing symbol value.
  66  *
  67  * However, the decoder's real goal is to decode symbols with the code, not just
  68  * generate the list of codewords.  Consequently, this function directly builds
  69  * a table for efficiently decoding symbols using the code.  The basic idea is
  70  * that given the next 'max_codeword_len' bits of input, the decoder can look up
  71  * the next decoded symbol by indexing a table containing '2^max_codeword_len'
  72  * entries.  A codeword with length 'max_codeword_len' will have exactly one
  73  * entry in this table, whereas a codeword shorter than 'max_codeword_len' will
  74  * have multiple entries in this table.  Precisely, a codeword of length 'n'
  75  * will have '2^(max_codeword_len - n)' entries.  The index of each such entry,
  76  * considered as a bitstring of length 'max_codeword_len', will contain the
  77  * corresponding codeword as a prefix.
  78  *
  79  * That's the basic idea, but we extend it in two ways:
  80  *
  81  * - Often the maximum codeword length is too long for it to be efficient to
  82  *   build the full decode table whenever a new code is used.  Instead, we build
  83  *   a "root" table using only '2^table_bits' entries, where 'table_bits <=
  84  *   max_codeword_len'.  Then, a lookup of 'table_bits' bits produces either a
  85  *   symbol directly (for codewords not longer than 'table_bits'), or the index
  86  *   of a subtable which must be indexed with additional bits of input to fully
  87  *   decode the symbol (for codewords longer than 'table_bits').
  88  *
  89  * - Whenever the decoder decodes a symbol, it needs to know the codeword length
  90  *   so that it can remove the appropriate number of input bits.  The obvious
  91  *   solution would be to simply retain the codeword lengths array and use the
  92  *   decoded symbol as an index into it.  However, that would require two array
  93  *   accesses when decoding each symbol.  Our strategy is to instead store the
  94  *   codeword length directly in the decode table entry along with the symbol.
  95  *
  96  * See MAKE_DECODE_TABLE_ENTRY() for full details on the format of decode table
  97  * entries, and see read_huffsym() for full details on how symbols are decoded.
  98  *
  99  * @decode_table:
 100  *      The array in which to build the decode table.  This must have been
 101  *      declared by the DECODE_TABLE() macro.  This may alias @lens, since all
 102  *      @lens are consumed before the decode table is written to.
 103  *
 104  * @num_syms:
 105  *      The number of symbols in the alphabet.
 106  *
 107  * @table_bits:
 108  *      The log base 2 of the number of entries in the root table.
 109  *
 110  * @lens:
 111  *      An array of length @num_syms, indexed by symbol, that gives the length
 112  *      of the codeword, in bits, for each symbol.  The length can be 0, which
 113  *      means that the symbol does not have a codeword assigned.  In addition,
 114  *      @lens may alias @decode_table, as noted above.
 115  *
 116  * @max_codeword_len:
 117  *      The maximum codeword length permitted for this code.  All entries in
 118  *      'lens' must be less than or equal to this value.
 119  *
 120  * @working_space
 121  *      A temporary array that was declared with DECODE_TABLE_WORKING_SPACE().
 122  *
 123  * Returns 0 on success, or -1 if the lengths do not form a valid prefix code.
 124  */
 125 int
 126 make_huffman_decode_table(u16 decode_table[], unsigned num_syms,
 127                           unsigned table_bits, const u8 lens[],
 128                           unsigned max_codeword_len, u16 working_space[])
 129 {
 130         u16 * const len_counts = &working_space[0];
 131         u16 * const offsets = &working_space[1 * (max_codeword_len + 1)];
 132         u16 * const sorted_syms = &working_space[2 * (max_codeword_len + 1)];
 133         s32 remainder = 1;
 134         void *entry_ptr = decode_table;
 135         unsigned codeword_len = 1;
 136         unsigned sym_idx;
 137         unsigned codeword;
 138         unsigned subtable_pos;
 139         unsigned subtable_bits;
 140         unsigned subtable_prefix;
 141
 142         /* Count how many codewords have each length, including 0.  */
 143         for (unsigned len = 0; len <= max_codeword_len; len++)
 144                 len_counts[len] = 0;
 145         for (unsigned sym = 0; sym < num_syms; sym++)
 146                 len_counts[lens[sym]]++;
 147
 148         /* It is already guaranteed that all lengths are <= max_codeword_len,
 149          * but it cannot be assumed they form a complete prefix code.  A
 150          * codeword of length n should require a proportion of the codespace
 151          * equaling (1/2)^n.  The code is complete if and only if, by this
 152          * measure, the codespace is exactly filled by the lengths.  */
 153         for (unsigned len = 1; len <= max_codeword_len; len++) {
 154                 remainder = (remainder << 1) - len_counts[len];
 155                 /* Do the lengths overflow the codespace? */
 156                 if (unlikely(remainder < 0))
 157                         return -1;
 158         }
 159
 160         if (remainder != 0) {
 161                 /* The lengths do not fill the codespace; that is, they form an
 162                  * incomplete code.  This is permitted only if the code is empty
 163                  * (contains no symbols). */
 164
 165                 if (unlikely(remainder != 1U << max_codeword_len))
 166                         return -1;
 167
 168                 /* The code is empty.  When processing a well-formed stream, the
 169                  * decode table need not be initialized in this case.  However,
 170                  * we cannot assume the stream is well-formed, so we must
 171                  * initialize the decode table anyway.  Setting all entries to 0
 172                  * makes the decode table always produce symbol '0' without
 173                  * consuming any bits, which is good enough. */
 174                 memset(decode_table, 0, sizeof(decode_table[0]) << table_bits);
 175                 return 0;
 176         }
 177
 178         /* Sort the symbols primarily by increasing codeword length and
 179          * secondarily by increasing symbol value. */
 180
 181         /* Initialize 'offsets' so that 'offsets[len]' is the number of
 182          * codewords shorter than 'len' bits, including length 0. */
 183         offsets[0] = 0;
 184         for (unsigned len = 0; len < max_codeword_len; len++)
 185                 offsets[len + 1] = offsets[len] + len_counts[len];
 186
 187         /* Use the 'offsets' array to sort the symbols. */
 188         for (unsigned sym = 0; sym < num_syms; sym++)
 189                 sorted_syms[offsets[lens[sym]]++] = sym;
 190
 191         /*
 192          * Fill the root table entries for codewords no longer than table_bits.
 193          *
 194          * The table will start with entries for the shortest codeword(s), which
 195          * will have the most entries.  From there, the number of entries per
 196          * codeword will decrease.  As an optimization, we may begin filling
 197          * entries with SSE2 vector accesses (8 entries/store), then change to
 198          * word accesses (2 or 4 entries/store), then change to 16-bit accesses
 199          * (1 entry/store).
 200          */
 201         sym_idx = offsets[0];
 202
 203 #ifdef __SSE2__
 204         /* Fill entries one 128-bit vector (8 entries) at a time. */
 205         for (unsigned stores_per_loop = (1U << (table_bits - codeword_len)) /
 206                                     (sizeof(__m128i) / sizeof(decode_table[0]));
 207              stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1)
 208         {
 209                 unsigned end_sym_idx = sym_idx + len_counts[codeword_len];
 210                 for (; sym_idx < end_sym_idx; sym_idx++) {
 211                         /* Note: unlike in the "word" version below, the __m128i
 212                          * type already has __attribute__((may_alias)), so using
 213                          * it to access an array of u16 will not violate strict
 214                          * aliasing.  */
 215                         __m128i v = _mm_set1_epi16(
 216                                 MAKE_DECODE_TABLE_ENTRY(sorted_syms[sym_idx],
 217                                                         codeword_len));
 218                         unsigned n = stores_per_loop;
 219                         do {
 220                                 *(__m128i *)entry_ptr = v;
 221                                 entry_ptr += sizeof(v);
 222                         } while (--n);
 223                 }
 224         }
 225 #endif /* __SSE2__ */
 226
 227 #ifdef __GNUC__
 228         /* Fill entries one word (2 or 4 entries) at a time. */
 229         for (unsigned stores_per_loop = (1U << (table_bits - codeword_len)) /
 230                                         (WORDBYTES / sizeof(decode_table[0]));
 231              stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1)
 232         {
 233                 unsigned end_sym_idx = sym_idx + len_counts[codeword_len];
 234                 for (; sym_idx < end_sym_idx; sym_idx++) {
 235
 236                         /* Accessing the array of u16 as u32 or u64 would
 237                          * violate strict aliasing and would require compiling
 238                          * the code with -fno-strict-aliasing to guarantee
 239                          * correctness.  To work around this problem, use the
 240                          * gcc 'may_alias' extension.  */
 241                         typedef machine_word_t
 242                                 __attribute__((may_alias)) aliased_word_t;
 243                         aliased_word_t v = repeat_u16(
 244                                 MAKE_DECODE_TABLE_ENTRY(sorted_syms[sym_idx],
 245                                                         codeword_len));
 246                         unsigned n = stores_per_loop;
 247                         do {
 248                                 *(aliased_word_t *)entry_ptr = v;
 249                                 entry_ptr += sizeof(v);
 250                         } while (--n);
 251                 }
 252         }
 253 #endif /* __GNUC__ */
 254
 255         /* Fill entries one at a time. */
 256         for (unsigned stores_per_loop = (1U << (table_bits - codeword_len));
 257              stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1)
 258         {
 259                 unsigned end_sym_idx = sym_idx + len_counts[codeword_len];
 260                 for (; sym_idx < end_sym_idx; sym_idx++) {
 261                         u16 v = MAKE_DECODE_TABLE_ENTRY(sorted_syms[sym_idx],
 262                                                         codeword_len);
 263                         unsigned n = stores_per_loop;
 264                         do {
 265                                 *(u16 *)entry_ptr = v;
 266                                 entry_ptr += sizeof(v);
 267                         } while (--n);
 268                 }
 269         }
 270
 271         /* If all symbols were processed, then no subtables are required. */
 272         if (sym_idx == num_syms)
 273                 return 0;
 274
 275         /* At least one subtable is required.  Process the remaining symbols. */
 276         codeword = ((u16 *)entry_ptr - decode_table) << 1;
 277         subtable_pos = 1U << table_bits;
 278         subtable_bits = table_bits;
 279         subtable_prefix = -1;
 280         do {
 281                 while (len_counts[codeword_len] == 0) {
 282                         codeword_len++;
 283                         codeword <<= 1;
 284                 }
 285
 286                 unsigned prefix = codeword >> (codeword_len - table_bits);
 287
 288                 /* Start a new subtable if the first 'table_bits' bits of the
 289                  * codeword don't match the prefix for the previous subtable, or
 290                  * if this will be the first subtable. */
 291                 if (prefix != subtable_prefix) {
 292
 293                         subtable_prefix = prefix;
 294
 295                         /*
 296                          * Calculate the subtable length.  If the codeword
 297                          * length exceeds 'table_bits' by n, then the subtable
 298                          * needs at least 2^n entries.  But it may need more; if
 299                          * there are fewer than 2^n codewords of length
 300                          * 'table_bits + n' remaining, then n will need to be
 301                          * incremented to bring in longer codewords until the
 302                          * subtable can be filled completely.  Note that it
 303                          * always will, eventually, be possible to fill the
 304                          * subtable, since it was previously verified that the
 305                          * code is complete.
 306                          */
 307                         subtable_bits = codeword_len - table_bits;
 308                         remainder = (s32)1 << subtable_bits;
 309                         for (;;) {
 310                                 remainder -= len_counts[table_bits +
 311                                                         subtable_bits];
 312                                 if (remainder <= 0)
 313                                         break;
 314                                 subtable_bits++;
 315                                 remainder <<= 1;
 316                         }
 317
 318                         /* Create the entry that points from the root table to
 319                          * the subtable.  This entry contains the index of the
 320                          * start of the subtable and the number of bits with
 321                          * which the subtable is indexed (the log base 2 of the
 322                          * number of entries it contains).  */
 323                         decode_table[subtable_prefix] =
 324                                 MAKE_DECODE_TABLE_ENTRY(subtable_pos,
 325                                                         subtable_bits);
 326                 }
 327
 328                 /* Fill the subtable entries for this symbol. */
 329                 u16 entry = MAKE_DECODE_TABLE_ENTRY(sorted_syms[sym_idx],
 330                                                     codeword_len - table_bits);
 331                 unsigned n = 1U << (subtable_bits - (codeword_len -
 332                                                      table_bits));
 333                 do {
 334                         decode_table[subtable_pos++] = entry;
 335                 } while (--n);
 336
 337                 len_counts[codeword_len]--;
 338                 codeword++;
 339         } while (++sym_idx < num_syms);
 340
 341         return 0;
 342 }