wimlib.net Git - wimlib/blob - src/decompress_common.c

   1 /*
   2  * decompress_common.c
   3  *
   4  * Code for decompression shared among multiple compression formats.
   5  *
   6  * The following copying information applies to this specific source code file:
   7  *
   8  * Written in 2012-2016 by Eric Biggers <ebiggers3@gmail.com>
   9  *
  10  * To the extent possible under law, the author(s) have dedicated all copyright
  11  * and related and neighboring rights to this software to the public domain
  12  * worldwide via the Creative Commons Zero 1.0 Universal Public Domain
  13  * Dedication (the "CC0").
  14  *
  15  * This software is distributed in the hope that it will be useful, but WITHOUT
  16  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  17  * FOR A PARTICULAR PURPOSE. See the CC0 for more details.
  18  *
  19  * You should have received a copy of the CC0 along with this software; if not
  20  * see <http://creativecommons.org/publicdomain/zero/1.0/>.
  21  */
  22
  23 #ifdef HAVE_CONFIG_H
  24 #  include "config.h"
  25 #endif
  26
  27 #include <string.h>
  28
  29 #ifdef __SSE2__
  30 #  include <emmintrin.h>
  31 #endif
  32
  33 #include "wimlib/decompress_common.h"
  34
  35 /*
  36  * make_huffman_decode_table() -
  37  *
  38  * Given an alphabet of symbols and the length of each symbol's codeword in a
  39  * canonical prefix code, build a table for quickly decoding symbols that were
  40  * encoded with that code.
  41  *
  42  * A _prefix code_ is an assignment of bitstrings called _codewords_ to symbols
  43  * such that no whole codeword is a prefix of any other.  A prefix code might be
  44  * a _Huffman code_, which means that it is an optimum prefix code for a given
  45  * list of symbol frequencies and was generated by the Huffman algorithm.
  46  * Although the prefix codes processed here will ordinarily be "Huffman codes",
  47  * strictly speaking the decoder cannot know whether a given code was actually
  48  * generated by the Huffman algorithm or not.
  49  *
  50  * A prefix code is _canonical_ if and only if a longer codeword never
  51  * lexicographically precedes a shorter codeword, and the lexicographic ordering
  52  * of codewords of equal length is the same as the lexicographic ordering of the
  53  * corresponding symbols.  The advantage of using a canonical prefix code is
  54  * that the codewords can be reconstructed from only the symbol => codeword
  55  * length mapping.  This eliminates the need to transmit the codewords
  56  * explicitly.  Instead, they can be enumerated in lexicographic order after
  57  * sorting the symbols primarily by increasing codeword length and secondarily
  58  * by increasing symbol value.
  59  *
  60  * However, the decoder's real goal is to decode symbols with the code, not just
  61  * generate the list of codewords.  Consequently, this function directly builds
  62  * a table for efficiently decoding symbols using the code.  The basic idea is
  63  * that given the next 'max_codeword_len' bits of input, the decoder can look up
  64  * the next decoded symbol by indexing a table containing '2^max_codeword_len'
  65  * entries.  A codeword with length 'max_codeword_len' will have exactly one
  66  * entry in this table, whereas a codeword shorter than 'max_codeword_len' will
  67  * have multiple entries in this table.  Precisely, a codeword of length 'n'
  68  * will have '2^(max_codeword_len - n)' entries.  The index of each such entry,
  69  * considered as a bitstring of length 'max_codeword_len', will contain the
  70  * corresponding codeword as a prefix.
  71  *
  72  * That's the basic idea, but we extend it in two ways:
  73  *
  74  * - Often the maximum codeword length is too long for it to be efficient to
  75  *   build the full decode table whenever a new code is used.  Instead, we build
  76  *   a "root" table using only '2^table_bits' entries, where 'table_bits <=
  77  *   max_codeword_len'.  Then, a lookup of 'table_bits' bits produces either a
  78  *   symbol directly (for codewords not longer than 'table_bits'), or the index
  79  *   of a subtable which must be indexed with additional bits of input to fully
  80  *   decode the symbol (for codewords longer than 'table_bits').
  81  *
  82  * - Whenever the decoder decodes a symbol, it needs to know the codeword length
  83  *   so that it can remove the appropriate number of input bits.  The obvious
  84  *   solution would be to simply retain the codeword lengths array and use the
  85  *   decoded symbol as an index into it.  However, that would require two array
  86  *   accesses when decoding each symbol.  Our strategy is to instead store the
  87  *   codeword length directly in the decode table entry along with the symbol.
  88  *
  89  * See MAKE_DECODE_TABLE_ENTRY() for full details on the format of decode table
  90  * entries, and see read_huffsym() for full details on how symbols are decoded.
  91  *
  92  * @decode_table:
  93  *      The array in which to build the decode table.  This must have been
  94  *      declared by the DECODE_TABLE() macro.  This may alias @lens, since all
  95  *      @lens are consumed before the decode table is written to.
  96  *
  97  * @num_syms:
  98  *      The number of symbols in the alphabet.
  99  *
 100  * @table_bits:
 101  *      The log base 2 of the number of entries in the root table.
 102  *
 103  * @lens:
 104  *      An array of length @num_syms, indexed by symbol, that gives the length
 105  *      of the codeword, in bits, for each symbol.  The length can be 0, which
 106  *      means that the symbol does not have a codeword assigned.  In addition,
 107  *      @lens may alias @decode_table, as noted above.
 108  *
 109  * @max_codeword_len:
 110  *      The maximum codeword length permitted for this code.  All entries in
 111  *      'lens' must be less than or equal to this value.
 112  *
 113  * @working_space
 114  *      A temporary array that was declared with DECODE_TABLE_WORKING_SPACE().
 115  *
 116  * Returns 0 on success, or -1 if the lengths do not form a valid prefix code.
 117  */
 118 int
 119 make_huffman_decode_table(u16 decode_table[], unsigned num_syms,
 120                           unsigned table_bits, const u8 lens[],
 121                           unsigned max_codeword_len, u16 working_space[])
 122 {
 123         u16 * const len_counts = &working_space[0];
 124         u16 * const offsets = &working_space[1 * (max_codeword_len + 1)];
 125         u16 * const sorted_syms = &working_space[2 * (max_codeword_len + 1)];
 126         s32 remainder = 1;
 127         void *entry_ptr = decode_table;
 128         unsigned codeword_len = 1;
 129         unsigned sym_idx;
 130         unsigned codeword;
 131         unsigned subtable_pos;
 132         unsigned subtable_bits;
 133         unsigned subtable_prefix;
 134
 135         /* Count how many codewords have each length, including 0.  */
 136         for (unsigned len = 0; len <= max_codeword_len; len++)
 137                 len_counts[len] = 0;
 138         for (unsigned sym = 0; sym < num_syms; sym++)
 139                 len_counts[lens[sym]]++;
 140
 141         /* It is already guaranteed that all lengths are <= max_codeword_len,
 142          * but it cannot be assumed they form a complete prefix code.  A
 143          * codeword of length n should require a proportion of the codespace
 144          * equaling (1/2)^n.  The code is complete if and only if, by this
 145          * measure, the codespace is exactly filled by the lengths.  */
 146         for (unsigned len = 1; len <= max_codeword_len; len++) {
 147                 remainder = (remainder << 1) - len_counts[len];
 148                 /* Do the lengths overflow the codespace? */
 149                 if (unlikely(remainder < 0))
 150                         return -1;
 151         }
 152
 153         if (remainder != 0) {
 154                 /* The lengths do not fill the codespace; that is, they form an
 155                  * incomplete code.  This is permitted only if the code is empty
 156                  * (contains no symbols). */
 157
 158                 if (unlikely(remainder != 1U << max_codeword_len))
 159                         return -1;
 160
 161                 /* The code is empty.  When processing a well-formed stream, the
 162                  * decode table need not be initialized in this case.  However,
 163                  * we cannot assume the stream is well-formed, so we must
 164                  * initialize the decode table anyway.  Setting all entries to 0
 165                  * makes the decode table always produce symbol '0' without
 166                  * consuming any bits, which is good enough. */
 167                 memset(decode_table, 0, sizeof(decode_table[0]) << table_bits);
 168                 return 0;
 169         }
 170
 171         /* Sort the symbols primarily by increasing codeword length and
 172          * secondarily by increasing symbol value. */
 173
 174         /* Initialize 'offsets' so that 'offsets[len]' is the number of
 175          * codewords shorter than 'len' bits, including length 0. */
 176         offsets[0] = 0;
 177         for (unsigned len = 0; len < max_codeword_len; len++)
 178                 offsets[len + 1] = offsets[len] + len_counts[len];
 179
 180         /* Use the 'offsets' array to sort the symbols. */
 181         for (unsigned sym = 0; sym < num_syms; sym++)
 182                 sorted_syms[offsets[lens[sym]]++] = sym;
 183
 184         /*
 185          * Fill the root table entries for codewords no longer than table_bits.
 186          *
 187          * The table will start with entries for the shortest codeword(s), which
 188          * will have the most entries.  From there, the number of entries per
 189          * codeword will decrease.  As an optimization, we may begin filling
 190          * entries with SSE2 vector accesses (8 entries/store), then change to
 191          * word accesses (2 or 4 entries/store), then change to 16-bit accesses
 192          * (1 entry/store).
 193          */
 194         sym_idx = offsets[0];
 195
 196 #ifdef __SSE2__
 197         /* Fill entries one 128-bit vector (8 entries) at a time. */
 198         for (unsigned stores_per_loop = (1U << (table_bits - codeword_len)) /
 199                                     (sizeof(__m128i) / sizeof(decode_table[0]));
 200              stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1)
 201         {
 202                 unsigned end_sym_idx = sym_idx + len_counts[codeword_len];
 203                 for (; sym_idx < end_sym_idx; sym_idx++) {
 204                         /* Note: unlike in the "word" version below, the __m128i
 205                          * type already has __attribute__((may_alias)), so using
 206                          * it to access an array of u16 will not violate strict
 207                          * aliasing.  */
 208                         __m128i v = _mm_set1_epi16(
 209                                 MAKE_DECODE_TABLE_ENTRY(sorted_syms[sym_idx],
 210                                                         codeword_len));
 211                         unsigned n = stores_per_loop;
 212                         do {
 213                                 *(__m128i *)entry_ptr = v;
 214                                 entry_ptr += sizeof(v);
 215                         } while (--n);
 216                 }
 217         }
 218 #endif /* __SSE2__ */
 219
 220 #ifdef __GNUC__
 221         /* Fill entries one word (2 or 4 entries) at a time. */
 222         for (unsigned stores_per_loop = (1U << (table_bits - codeword_len)) /
 223                                         (WORDBYTES / sizeof(decode_table[0]));
 224              stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1)
 225         {
 226                 unsigned end_sym_idx = sym_idx + len_counts[codeword_len];
 227                 for (; sym_idx < end_sym_idx; sym_idx++) {
 228
 229                         /* Accessing the array of u16 as u32 or u64 would
 230                          * violate strict aliasing and would require compiling
 231                          * the code with -fno-strict-aliasing to guarantee
 232                          * correctness.  To work around this problem, use the
 233                          * gcc 'may_alias' extension.  */
 234                         typedef machine_word_t
 235                                 __attribute__((may_alias)) aliased_word_t;
 236                         aliased_word_t v = repeat_u16(
 237                                 MAKE_DECODE_TABLE_ENTRY(sorted_syms[sym_idx],
 238                                                         codeword_len));
 239                         unsigned n = stores_per_loop;
 240                         do {
 241                                 *(aliased_word_t *)entry_ptr = v;
 242                                 entry_ptr += sizeof(v);
 243                         } while (--n);
 244                 }
 245         }
 246 #endif /* __GNUC__ */
 247
 248         /* Fill entries one at a time. */
 249         for (unsigned stores_per_loop = (1U << (table_bits - codeword_len));
 250              stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1)
 251         {
 252                 unsigned end_sym_idx = sym_idx + len_counts[codeword_len];
 253                 for (; sym_idx < end_sym_idx; sym_idx++) {
 254                         u16 v = MAKE_DECODE_TABLE_ENTRY(sorted_syms[sym_idx],
 255                                                         codeword_len);
 256                         unsigned n = stores_per_loop;
 257                         do {
 258                                 *(u16 *)entry_ptr = v;
 259                                 entry_ptr += sizeof(v);
 260                         } while (--n);
 261                 }
 262         }
 263
 264         /* If all symbols were processed, then no subtables are required. */
 265         if (sym_idx == num_syms)
 266                 return 0;
 267
 268         /* At least one subtable is required.  Process the remaining symbols. */
 269         codeword = ((u16 *)entry_ptr - decode_table) << 1;
 270         subtable_pos = 1U << table_bits;
 271         subtable_bits = table_bits;
 272         subtable_prefix = -1;
 273         do {
 274                 while (len_counts[codeword_len] == 0) {
 275                         codeword_len++;
 276                         codeword <<= 1;
 277                 }
 278
 279                 unsigned prefix = codeword >> (codeword_len - table_bits);
 280
 281                 /* Start a new subtable if the first 'table_bits' bits of the
 282                  * codeword don't match the prefix for the previous subtable, or
 283                  * if this will be the first subtable. */
 284                 if (prefix != subtable_prefix) {
 285
 286                         subtable_prefix = prefix;
 287
 288                         /*
 289                          * Calculate the subtable length.  If the codeword
 290                          * length exceeds 'table_bits' by n, then the subtable
 291                          * needs at least 2^n entries.  But it may need more; if
 292                          * there are fewer than 2^n codewords of length
 293                          * 'table_bits + n' remaining, then n will need to be
 294                          * incremented to bring in longer codewords until the
 295                          * subtable can be filled completely.  Note that it
 296                          * always will, eventually, be possible to fill the
 297                          * subtable, since it was previously verified that the
 298                          * code is complete.
 299                          */
 300                         subtable_bits = codeword_len - table_bits;
 301                         remainder = (s32)1 << subtable_bits;
 302                         for (;;) {
 303                                 remainder -= len_counts[table_bits +
 304                                                         subtable_bits];
 305                                 if (remainder <= 0)
 306                                         break;
 307                                 subtable_bits++;
 308                                 remainder <<= 1;
 309                         }
 310
 311                         /* Create the entry that points from the root table to
 312                          * the subtable.  This entry contains the index of the
 313                          * start of the subtable and the number of bits with
 314                          * which the subtable is indexed (the log base 2 of the
 315                          * number of entries it contains).  */
 316                         decode_table[subtable_prefix] =
 317                                 MAKE_DECODE_TABLE_ENTRY(subtable_pos,
 318                                                         subtable_bits);
 319                 }
 320
 321                 /* Fill the subtable entries for this symbol. */
 322                 u16 entry = MAKE_DECODE_TABLE_ENTRY(sorted_syms[sym_idx],
 323                                                     codeword_len - table_bits);
 324                 unsigned n = 1U << (subtable_bits - (codeword_len -
 325                                                      table_bits));
 326                 do {
 327                         decode_table[subtable_pos++] = entry;
 328                 } while (--n);
 329
 330                 len_counts[codeword_len]--;
 331                 codeword++;
 332         } while (++sym_idx < num_syms);
 333
 334         return 0;
 335 }