wimlib.net Git - wimlib/blob - src/decompress_common.c

   1 /*
   2  * decompress_common.c
   3  *
   4  * Code for decompression shared among multiple compression formats.
   5  *
   6  * The following copying information applies to this specific source code file:
   7  *
   8  * Written in 2012-2016 by Eric Biggers <ebiggers3@gmail.com>
   9  *
  10  * To the extent possible under law, the author(s) have dedicated all copyright
  11  * and related and neighboring rights to this software to the public domain
  12  * worldwide via the Creative Commons Zero 1.0 Universal Public Domain
  13  * Dedication (the "CC0").
  14  *
  15  * This software is distributed in the hope that it will be useful, but WITHOUT
  16  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  17  * FOR A PARTICULAR PURPOSE. See the CC0 for more details.
  18  *
  19  * You should have received a copy of the CC0 along with this software; if not
  20  * see <http://creativecommons.org/publicdomain/zero/1.0/>.
  21  */
  22
  23 #ifdef HAVE_CONFIG_H
  24 #  include "config.h"
  25 #endif
  26
  27 #include <string.h>
  28
  29 #ifdef __SSE2__
  30 #  include <emmintrin.h>
  31 #endif
  32
  33 #include "wimlib/decompress_common.h"
  34
  35 #define MAKE_ENTRY(sym, len) (((sym) << DECODE_TABLE_SYMBOL_SHIFT) | (len))
  36
  37 /*
  38  * make_huffman_decode_table() -
  39  *
  40  * Build a decoding table for a canonical prefix code, or "Huffman code".  This
  41  * takes as input the length of the codeword for each symbol in the code and
  42  * produces as output a table for fast symbol decoding with read_huffsym().
  43  *
  44  * Because the code is assumed to be "canonical", it can be reconstructed
  45  * directly from the codeword lengths.  A prefix code is canonical if and only
  46  * if a longer codeword never lexicographically precedes a shorter codeword, and
  47  * the lexicographic ordering of codewords of the same length is the same as the
  48  * lexicographic ordering of the corresponding symbols.  Consequently, we can
  49  * sort the symbols primarily by codeword length and secondarily by symbol
  50  * value, then reconstruct the code by generating codewords lexicographically in
  51  * that order.
  52  *
  53  * This function does not, however, generate the code explicitly.  Instead, it
  54  * directly builds a table for decoding symbols using the code.  The basic idea
  55  * is this: given the next 'max_codeword_len' bits in the input, we can look up
  56  * the decoded symbol by indexing a table containing 2**max_codeword_len
  57  * entries.  A codeword with length 'max_codeword_len' will have exactly one
  58  * entry in this table, whereas a codeword shorter than 'max_codeword_len' will
  59  * have multiple entries in this table.  Precisely, a codeword of length n will
  60  * be represented by 2**(max_codeword_len - n) entries in this table.  The
  61  * 0-based index of each such entry will contain the corresponding codeword as a
  62  * prefix when zero-padded on the left to 'max_codeword_len' binary digits.
  63  *
  64  * That's the basic idea, but we implement two optimizations:
  65  *
  66  * - Often the maximum codeword length is too long for it to be efficient to
  67  *   build the full decoding table whenever a new code is used.  Instead, we can
  68  *   build the table using only 2**table_bits entries, where 'table_bits <=
  69  *   max_codeword_len'.  Then, a lookup of 'table_bits' bits will produce either
  70  *   a codeword directly (for codewords not longer than 'table_bits') or the
  71  *   index of a subtable which must be indexed with additional bits of input to
  72  *   decode the full codeword (for codewords longer than 'table_bits').
  73  *
  74  * - When we decode a symbol, we still need to know its codeword length so that
  75  *   the bitstream can be advanced by the appropriate number of bits.  The
  76  *   obvious solution is to simply retain the 'lens' array and use the decoded
  77  *   symbol as an index into it.  However, this requires two separate array
  78  *   accesses in the fast path.  The optimization is to store the length
  79  *   directly in the decode table, along with the symbol.
  80  *
  81  * @decode_table:
  82  *      The array in which to build the decode table.  This must have been
  83  *      declared by the DECODE_TABLE() macro.  This may alias @lens, since all
  84  *      @lens are consumed before the decode table is written to.
  85  *
  86  * @num_syms:
  87  *      The number of symbols in the alphabet.
  88  *
  89  * @table_bits:
  90  *      The log base 2 of the number of entries in the root table.
  91  *
  92  * @lens:
  93  *      An array of length @num_syms, indexable by symbol, that gives the length
  94  *      of the codeword, in bits, for that symbol.  The length can be 0, which
  95  *      means that the symbol does not have a codeword assigned.  In addition,
  96  *      @lens may alias @decode_table, as noted above.
  97  *
  98  * @max_codeword_len:
  99  *      The maximum codeword length permitted for this code.  All entries in
 100  *      'lens' must be less than or equal to this value.
 101  *
 102  * Returns 0 on success, or -1 if the lengths do not form a valid prefix code.
 103  */
 104 int
 105 make_huffman_decode_table(u16 decode_table[], unsigned num_syms,
 106                           unsigned table_bits, const u8 lens[],
 107                           unsigned max_codeword_len)
 108 {
 109         u16 offsets[max_codeword_len + 1];
 110         u16 len_counts[max_codeword_len + 1];
 111         u16 sorted_syms[num_syms];
 112         s32 remainder = 1;
 113         void *entry_ptr = decode_table;
 114         unsigned codeword_len = 1;
 115         unsigned sym_idx;
 116         unsigned codeword;
 117         unsigned subtable_pos;
 118         unsigned subtable_bits;
 119         unsigned subtable_prefix;
 120
 121         /* Count how many codewords have each length, including 0.  */
 122         for (unsigned len = 0; len <= max_codeword_len; len++)
 123                 len_counts[len] = 0;
 124         for (unsigned sym = 0; sym < num_syms; sym++)
 125                 len_counts[lens[sym]]++;
 126
 127         /* It is already guaranteed that all lengths are <= max_codeword_len,
 128          * but it cannot be assumed they form a complete prefix code.  A
 129          * codeword of length n should require a proportion of the codespace
 130          * equaling (1/2)^n.  The code is complete if and only if, by this
 131          * measure, the codespace is exactly filled by the lengths.  */
 132         for (unsigned len = 1; len <= max_codeword_len; len++) {
 133                 remainder = (remainder << 1) - len_counts[len];
 134                 /* Do the lengths overflow the codespace? */
 135                 if (unlikely(remainder < 0))
 136                         return -1;
 137         }
 138
 139         if (remainder != 0) {
 140                 /* The lengths do not fill the codespace; that is, they form an
 141                  * incomplete code.  This is permitted only if the code is empty
 142                  * (contains no symbols). */
 143
 144                 if (unlikely(remainder != (s32)1 << max_codeword_len))
 145                         return -1;
 146
 147                 /* The code is empty.  When processing a well-formed stream, the
 148                  * decode table need not be initialized in this case.  However,
 149                  * we cannot assume the stream is well-formed, so we must
 150                  * initialize the decode table anyway.  Setting all entries to 0
 151                  * makes this table always produce symbol '0' without consuming
 152                  * any bits, which is good enough. */
 153                 memset(decode_table, 0,
 154                        (1U << table_bits) * sizeof(decode_table[0]));
 155                 return 0;
 156         }
 157
 158         /* Sort the symbols primarily by increasing codeword length and
 159          * secondarily by increasing symbol value. */
 160
 161         /* Initialize 'offsets' so that 'offsets[len]' is the number of
 162          * codewords shorter than 'len' bits, including length 0. */
 163         offsets[0] = 0;
 164         for (unsigned len = 0; len < max_codeword_len; len++)
 165                 offsets[len + 1] = offsets[len] + len_counts[len];
 166
 167         /* Use the 'offsets' array to sort the symbols. */
 168         for (unsigned sym = 0; sym < num_syms; sym++)
 169                 sorted_syms[offsets[lens[sym]]++] = sym;
 170
 171         /*
 172          * Fill entries for codewords with length <= table_bits
 173          * --- that is, those short enough for a direct mapping.
 174          *
 175          * The table will start with entries for the shortest codeword(s), which
 176          * have the most entries.  From there, the number of entries per
 177          * codeword will decrease.  As an optimization, we may begin filling
 178          * entries with SSE2 vector accesses (8 entries/store), then change to
 179          * 'machine_word_t' accesses (2 or 4 entries/store), then change to
 180          * 16-bit accesses (1 entry/store).
 181          */
 182         sym_idx = offsets[0];
 183 #ifdef __SSE2__
 184         /* Fill entries one 128-bit vector (8 entries) at a time. */
 185         for (unsigned stores_per_loop = (1U << (table_bits - codeword_len)) /
 186                                     (sizeof(__m128i) / sizeof(decode_table[0]));
 187              stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1)
 188         {
 189                 unsigned end_sym_idx = sym_idx + len_counts[codeword_len];
 190                 for (; sym_idx < end_sym_idx; sym_idx++) {
 191                         /* Note: unlike in the "word" version below, the __m128i
 192                          * type already has __attribute__((may_alias)), so using
 193                          * it to access an array of u16 will not violate strict
 194                          * aliasing.  */
 195                         __m128i v = _mm_set1_epi16(
 196                                 MAKE_ENTRY(sorted_syms[sym_idx], codeword_len));
 197                         unsigned n = stores_per_loop;
 198                         do {
 199                                 *(__m128i *)entry_ptr = v;
 200                                 entry_ptr += sizeof(v);
 201                         } while (--n);
 202                 }
 203         }
 204 #endif /* __SSE2__ */
 205
 206 #ifdef __GNUC__
 207         /* Fill entries one word (2 or 4 entries) at a time. */
 208         for (unsigned stores_per_loop = (1U << (table_bits - codeword_len)) /
 209                                         (WORDBYTES / sizeof(decode_table[0]));
 210              stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1)
 211         {
 212                 unsigned end_sym_idx = sym_idx + len_counts[codeword_len];
 213                 for (; sym_idx < end_sym_idx; sym_idx++) {
 214
 215                         /* Accessing the array of u16 as u32 or u64 would
 216                          * violate strict aliasing and would require compiling
 217                          * the code with -fno-strict-aliasing to guarantee
 218                          * correctness.  To work around this problem, use the
 219                          * gcc 'may_alias' extension.  */
 220                         typedef machine_word_t
 221                                 __attribute__((may_alias)) aliased_word_t;
 222                         aliased_word_t v = repeat_u16(
 223                                 MAKE_ENTRY(sorted_syms[sym_idx], codeword_len));
 224                         unsigned n = stores_per_loop;
 225
 226                         do {
 227                                 *(aliased_word_t *)entry_ptr = v;
 228                                 entry_ptr += sizeof(v);
 229                         } while (--n);
 230                 }
 231         }
 232 #endif /* __GNUC__ */
 233
 234         /* Fill entries one at a time. */
 235         for (unsigned stores_per_loop = (1U << (table_bits - codeword_len));
 236              stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1)
 237         {
 238                 unsigned end_sym_idx = sym_idx + len_counts[codeword_len];
 239                 for (; sym_idx < end_sym_idx; sym_idx++) {
 240                         u16 v = MAKE_ENTRY(sorted_syms[sym_idx], codeword_len);
 241                         unsigned n = stores_per_loop;
 242                         do {
 243                                 *(u16 *)entry_ptr = v;
 244                                 entry_ptr += sizeof(v);
 245                         } while (--n);
 246                 }
 247         }
 248
 249         /* If all symbols were processed, then no subtables are required. */
 250         if (sym_idx == num_syms)
 251                 return 0;
 252
 253         /* At least one subtable is required.  Process the remaining symbols. */
 254         codeword = ((u16 *)entry_ptr - decode_table) << 1;
 255         subtable_pos = 1U << table_bits;
 256         subtable_bits = table_bits;
 257         subtable_prefix = -1;
 258         do {
 259                 while (len_counts[codeword_len] == 0) {
 260                         codeword_len++;
 261                         codeword <<= 1;
 262                 }
 263
 264                 unsigned prefix = codeword >> (codeword_len - table_bits);
 265
 266                 /* Start a new subtable if the first 'table_bits' bits of the
 267                  * codeword don't match the prefix for the previous subtable, or
 268                  * if this will be the first subtable. */
 269                 if (prefix != subtable_prefix) {
 270
 271                         subtable_prefix = prefix;
 272
 273                         /* Calculate the subtable length.  If the codeword
 274                          * length exceeds 'table_bits' by n, the subtable needs
 275                          * at least 2**n entries.  But it may need more; if
 276                          * there are fewer than 2**n codewords of length
 277                          * 'table_bits + n' remaining, then n will need to be
 278                          * incremented to bring in longer codewords until the
 279                          * subtable can be filled completely.  Note that it
 280                          * always will, eventually, be possible to fill the
 281                          * subtable, since it was previously verified that the
 282                          * code is complete. */
 283                         subtable_bits = codeword_len - table_bits;
 284                         remainder = (s32)1 << subtable_bits;
 285                         for (;;) {
 286                                 remainder -= len_counts[table_bits +
 287                                                         subtable_bits];
 288                                 if (remainder <= 0)
 289                                         break;
 290                                 subtable_bits++;
 291                                 remainder <<= 1;
 292                         }
 293
 294                         /* Create the entry that points from the root table to
 295                          * the subtable.  This entry contains the index of the
 296                          * start of the subtable and the number of bits with
 297                          * which the subtable is indexed (the log base 2 of the
 298                          * number of entries it contains).  */
 299                         decode_table[subtable_prefix] =
 300                                 MAKE_ENTRY(subtable_pos, subtable_bits);
 301                 }
 302
 303                 u16 entry = MAKE_ENTRY(sorted_syms[sym_idx],
 304                                        codeword_len - table_bits);
 305                 unsigned n = 1U << (subtable_bits - (codeword_len -
 306                                                      table_bits));
 307                 do {
 308                         decode_table[subtable_pos++] = entry;
 309                 } while (--n);
 310
 311                 len_counts[codeword_len]--;
 312                 codeword++;
 313         } while (++sym_idx < num_syms);
 314
 315         return 0;
 316 }