wimlib.net Git - wimlib/blob - src/decompress_common.c

   1 /*
   2  * decompress_common.c
   3  *
   4  * Code for decompression shared among multiple compression formats.
   5  *
   6  * The following copying information applies to this specific source code file:
   7  *
   8  * Written in 2012-2016 by Eric Biggers <ebiggers3@gmail.com>
   9  *
  10  * To the extent possible under law, the author(s) have dedicated all copyright
  11  * and related and neighboring rights to this software to the public domain
  12  * worldwide via the Creative Commons Zero 1.0 Universal Public Domain
  13  * Dedication (the "CC0").
  14  *
  15  * This software is distributed in the hope that it will be useful, but WITHOUT
  16  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  17  * FOR A PARTICULAR PURPOSE. See the CC0 for more details.
  18  *
  19  * You should have received a copy of the CC0 along with this software; if not
  20  * see <http://creativecommons.org/publicdomain/zero/1.0/>.
  21  */
  22
  23 #ifdef HAVE_CONFIG_H
  24 #  include "config.h"
  25 #endif
  26
  27 #include <string.h>
  28
  29 #ifdef __SSE2__
  30 #  include <emmintrin.h>
  31 #endif
  32
  33 #include "wimlib/decompress_common.h"
  34
  35 /*
  36  * make_huffman_decode_table() -
  37  *
  38  * Given an alphabet of symbols and the length of each symbol's codeword in a
  39  * canonical prefix code, build a table for quickly decoding symbols that were
  40  * encoded with that code.
  41  *
  42  * A _prefix code_ is an assignment of bitstrings called _codewords_ to symbols
  43  * such that no whole codeword is a prefix of any other.  A prefix code might be
  44  * a _Huffman code_, which means that it is an optimum prefix code for a given
  45  * list of symbol frequencies and was generated by the Huffman algorithm.
  46  * Although the prefix codes processed here will ordinarily be "Huffman codes",
  47  * strictly speaking the decoder cannot know whether a given code was actually
  48  * generated by the Huffman algorithm or not.
  49  *
  50  * A prefix code is _canonical_ if and only if a longer codeword never
  51  * lexicographically precedes a shorter codeword, and the lexicographic ordering
  52  * of codewords of equal length is the same as the lexicographic ordering of the
  53  * corresponding symbols.  The advantage of using a canonical prefix code is
  54  * that the codewords can be reconstructed from only the symbol => codeword
  55  * length mapping.  This eliminates the need to transmit the codewords
  56  * explicitly.  Instead, they can be enumerated in lexicographic order after
  57  * sorting the symbols primarily by increasing codeword length and secondarily
  58  * by increasing symbol value.
  59  *
  60  * However, the decoder's real goal is to decode symbols with the code, not just
  61  * generate the list of codewords.  Consequently, this function directly builds
  62  * a table for efficiently decoding symbols using the code.  The basic idea is
  63  * that given the next 'max_codeword_len' bits of input, the decoder can look up
  64  * the next decoded symbol by indexing a table containing '2^max_codeword_len'
  65  * entries.  A codeword with length 'max_codeword_len' will have exactly one
  66  * entry in this table, whereas a codeword shorter than 'max_codeword_len' will
  67  * have multiple entries in this table.  Precisely, a codeword of length 'n'
  68  * will have '2^(max_codeword_len - n)' entries.  The index of each such entry,
  69  * considered as a bitstring of length 'max_codeword_len', will contain the
  70  * corresponding codeword as a prefix.
  71  *
  72  * That's the basic idea, but we extend it in two ways:
  73  *
  74  * - Often the maximum codeword length is too long for it to be efficient to
  75  *   build the full decode table whenever a new code is used.  Instead, we build
  76  *   a "root" table using only '2^table_bits' entries, where 'table_bits <=
  77  *   max_codeword_len'.  Then, a lookup of 'table_bits' bits produces either a
  78  *   symbol directly (for codewords not longer than 'table_bits'), or the index
  79  *   of a subtable which must be indexed with additional bits of input to fully
  80  *   decode the symbol (for codewords longer than 'table_bits').
  81  *
  82  * - Whenever the decoder decodes a symbol, it needs to know the codeword length
  83  *   so that it can remove the appropriate number of input bits.  The obvious
  84  *   solution would be to simply retain the codeword lengths array and use the
  85  *   decoded symbol as an index into it.  However, that would require two array
  86  *   accesses when decoding each symbol.  Our strategy is to instead store the
  87  *   codeword length directly in the decode table entry along with the symbol.
  88  *
  89  * See MAKE_DECODE_TABLE_ENTRY() for full details on the format of decode table
  90  * entries, and see read_huffsym() for full details on how symbols are decoded.
  91  *
  92  * @decode_table:
  93  *      The array in which to build the decode table.  This must have been
  94  *      declared by the DECODE_TABLE() macro.  This may alias @lens, since all
  95  *      @lens are consumed before the decode table is written to.
  96  *
  97  * @num_syms:
  98  *      The number of symbols in the alphabet.
  99  *
 100  * @table_bits:
 101  *      The log base 2 of the number of entries in the root table.
 102  *
 103  * @lens:
 104  *      An array of length @num_syms, indexed by symbol, that gives the length
 105  *      of the codeword, in bits, for each symbol.  The length can be 0, which
 106  *      means that the symbol does not have a codeword assigned.  In addition,
 107  *      @lens may alias @decode_table, as noted above.
 108  *
 109  * @max_codeword_len:
 110  *      The maximum codeword length permitted for this code.  All entries in
 111  *      'lens' must be less than or equal to this value.
 112  *
 113  * Returns 0 on success, or -1 if the lengths do not form a valid prefix code.
 114  */
 115 int
 116 make_huffman_decode_table(u16 decode_table[], unsigned num_syms,
 117                           unsigned table_bits, const u8 lens[],
 118                           unsigned max_codeword_len)
 119 {
 120         u16 len_counts[max_codeword_len + 1];
 121         u16 offsets[max_codeword_len + 1];
 122         u16 sorted_syms[num_syms];
 123         s32 remainder = 1;
 124         void *entry_ptr = decode_table;
 125         unsigned codeword_len = 1;
 126         unsigned sym_idx;
 127         unsigned codeword;
 128         unsigned subtable_pos;
 129         unsigned subtable_bits;
 130         unsigned subtable_prefix;
 131
 132         /* Count how many codewords have each length, including 0.  */
 133         for (unsigned len = 0; len <= max_codeword_len; len++)
 134                 len_counts[len] = 0;
 135         for (unsigned sym = 0; sym < num_syms; sym++)
 136                 len_counts[lens[sym]]++;
 137
 138         /* It is already guaranteed that all lengths are <= max_codeword_len,
 139          * but it cannot be assumed they form a complete prefix code.  A
 140          * codeword of length n should require a proportion of the codespace
 141          * equaling (1/2)^n.  The code is complete if and only if, by this
 142          * measure, the codespace is exactly filled by the lengths.  */
 143         for (unsigned len = 1; len <= max_codeword_len; len++) {
 144                 remainder = (remainder << 1) - len_counts[len];
 145                 /* Do the lengths overflow the codespace? */
 146                 if (unlikely(remainder < 0))
 147                         return -1;
 148         }
 149
 150         if (remainder != 0) {
 151                 /* The lengths do not fill the codespace; that is, they form an
 152                  * incomplete code.  This is permitted only if the code is empty
 153                  * (contains no symbols). */
 154
 155                 if (unlikely(remainder != 1U << max_codeword_len))
 156                         return -1;
 157
 158                 /* The code is empty.  When processing a well-formed stream, the
 159                  * decode table need not be initialized in this case.  However,
 160                  * we cannot assume the stream is well-formed, so we must
 161                  * initialize the decode table anyway.  Setting all entries to 0
 162                  * makes the decode table always produce symbol '0' without
 163                  * consuming any bits, which is good enough. */
 164                 memset(decode_table, 0, sizeof(decode_table[0]) << table_bits);
 165                 return 0;
 166         }
 167
 168         /* Sort the symbols primarily by increasing codeword length and
 169          * secondarily by increasing symbol value. */
 170
 171         /* Initialize 'offsets' so that 'offsets[len]' is the number of
 172          * codewords shorter than 'len' bits, including length 0. */
 173         offsets[0] = 0;
 174         for (unsigned len = 0; len < max_codeword_len; len++)
 175                 offsets[len + 1] = offsets[len] + len_counts[len];
 176
 177         /* Use the 'offsets' array to sort the symbols. */
 178         for (unsigned sym = 0; sym < num_syms; sym++)
 179                 sorted_syms[offsets[lens[sym]]++] = sym;
 180
 181         /*
 182          * Fill the root table entries for codewords no longer than table_bits.
 183          *
 184          * The table will start with entries for the shortest codeword(s), which
 185          * will have the most entries.  From there, the number of entries per
 186          * codeword will decrease.  As an optimization, we may begin filling
 187          * entries with SSE2 vector accesses (8 entries/store), then change to
 188          * word accesses (2 or 4 entries/store), then change to 16-bit accesses
 189          * (1 entry/store).
 190          */
 191         sym_idx = offsets[0];
 192
 193 #ifdef __SSE2__
 194         /* Fill entries one 128-bit vector (8 entries) at a time. */
 195         for (unsigned stores_per_loop = (1U << (table_bits - codeword_len)) /
 196                                     (sizeof(__m128i) / sizeof(decode_table[0]));
 197              stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1)
 198         {
 199                 unsigned end_sym_idx = sym_idx + len_counts[codeword_len];
 200                 for (; sym_idx < end_sym_idx; sym_idx++) {
 201                         /* Note: unlike in the "word" version below, the __m128i
 202                          * type already has __attribute__((may_alias)), so using
 203                          * it to access an array of u16 will not violate strict
 204                          * aliasing.  */
 205                         __m128i v = _mm_set1_epi16(
 206                                 MAKE_DECODE_TABLE_ENTRY(sorted_syms[sym_idx],
 207                                                         codeword_len));
 208                         unsigned n = stores_per_loop;
 209                         do {
 210                                 *(__m128i *)entry_ptr = v;
 211                                 entry_ptr += sizeof(v);
 212                         } while (--n);
 213                 }
 214         }
 215 #endif /* __SSE2__ */
 216
 217 #ifdef __GNUC__
 218         /* Fill entries one word (2 or 4 entries) at a time. */
 219         for (unsigned stores_per_loop = (1U << (table_bits - codeword_len)) /
 220                                         (WORDBYTES / sizeof(decode_table[0]));
 221              stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1)
 222         {
 223                 unsigned end_sym_idx = sym_idx + len_counts[codeword_len];
 224                 for (; sym_idx < end_sym_idx; sym_idx++) {
 225
 226                         /* Accessing the array of u16 as u32 or u64 would
 227                          * violate strict aliasing and would require compiling
 228                          * the code with -fno-strict-aliasing to guarantee
 229                          * correctness.  To work around this problem, use the
 230                          * gcc 'may_alias' extension.  */
 231                         typedef machine_word_t
 232                                 __attribute__((may_alias)) aliased_word_t;
 233                         aliased_word_t v = repeat_u16(
 234                                 MAKE_DECODE_TABLE_ENTRY(sorted_syms[sym_idx],
 235                                                         codeword_len));
 236                         unsigned n = stores_per_loop;
 237                         do {
 238                                 *(aliased_word_t *)entry_ptr = v;
 239                                 entry_ptr += sizeof(v);
 240                         } while (--n);
 241                 }
 242         }
 243 #endif /* __GNUC__ */
 244
 245         /* Fill entries one at a time. */
 246         for (unsigned stores_per_loop = (1U << (table_bits - codeword_len));
 247              stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1)
 248         {
 249                 unsigned end_sym_idx = sym_idx + len_counts[codeword_len];
 250                 for (; sym_idx < end_sym_idx; sym_idx++) {
 251                         u16 v = MAKE_DECODE_TABLE_ENTRY(sorted_syms[sym_idx],
 252                                                         codeword_len);
 253                         unsigned n = stores_per_loop;
 254                         do {
 255                                 *(u16 *)entry_ptr = v;
 256                                 entry_ptr += sizeof(v);
 257                         } while (--n);
 258                 }
 259         }
 260
 261         /* If all symbols were processed, then no subtables are required. */
 262         if (sym_idx == num_syms)
 263                 return 0;
 264
 265         /* At least one subtable is required.  Process the remaining symbols. */
 266         codeword = ((u16 *)entry_ptr - decode_table) << 1;
 267         subtable_pos = 1U << table_bits;
 268         subtable_bits = table_bits;
 269         subtable_prefix = -1;
 270         do {
 271                 while (len_counts[codeword_len] == 0) {
 272                         codeword_len++;
 273                         codeword <<= 1;
 274                 }
 275
 276                 unsigned prefix = codeword >> (codeword_len - table_bits);
 277
 278                 /* Start a new subtable if the first 'table_bits' bits of the
 279                  * codeword don't match the prefix for the previous subtable, or
 280                  * if this will be the first subtable. */
 281                 if (prefix != subtable_prefix) {
 282
 283                         subtable_prefix = prefix;
 284
 285                         /*
 286                          * Calculate the subtable length.  If the codeword
 287                          * length exceeds 'table_bits' by n, then the subtable
 288                          * needs at least 2^n entries.  But it may need more; if
 289                          * there are fewer than 2^n codewords of length
 290                          * 'table_bits + n' remaining, then n will need to be
 291                          * incremented to bring in longer codewords until the
 292                          * subtable can be filled completely.  Note that it
 293                          * always will, eventually, be possible to fill the
 294                          * subtable, since it was previously verified that the
 295                          * code is complete.
 296                          */
 297                         subtable_bits = codeword_len - table_bits;
 298                         remainder = (s32)1 << subtable_bits;
 299                         for (;;) {
 300                                 remainder -= len_counts[table_bits +
 301                                                         subtable_bits];
 302                                 if (remainder <= 0)
 303                                         break;
 304                                 subtable_bits++;
 305                                 remainder <<= 1;
 306                         }
 307
 308                         /* Create the entry that points from the root table to
 309                          * the subtable.  This entry contains the index of the
 310                          * start of the subtable and the number of bits with
 311                          * which the subtable is indexed (the log base 2 of the
 312                          * number of entries it contains).  */
 313                         decode_table[subtable_prefix] =
 314                                 MAKE_DECODE_TABLE_ENTRY(subtable_pos,
 315                                                         subtable_bits);
 316                 }
 317
 318                 /* Fill the subtable entries for this symbol. */
 319                 u16 entry = MAKE_DECODE_TABLE_ENTRY(sorted_syms[sym_idx],
 320                                                     codeword_len - table_bits);
 321                 unsigned n = 1U << (subtable_bits - (codeword_len -
 322                                                      table_bits));
 323                 do {
 324                         decode_table[subtable_pos++] = entry;
 325                 } while (--n);
 326
 327                 len_counts[codeword_len]--;
 328                 codeword++;
 329         } while (++sym_idx < num_syms);
 330
 331         return 0;
 332 }