wimlib.net Git - wimlib/blob - src/lz.c

   1 /*
   2  * lz.c
   3  *
   4  * This file provides the code to analyze a buffer of uncompressed data for
   5  * matches, as per the LZ77 algorithm.  It uses a hash table to accelerate the
   6  * process.  This is based on code from zlib (v. 1.2.5).
   7  */
   8
   9 /*
  10  * Copyright (C) 2012 Eric Biggers
  11  * Copyright (C) 1995-2010 Jean-loup Gailly and Mark Adler
  12  *
  13  * This file is part of wimlib, a library for working with WIM files.
  14  *
  15  * wimlib is free software; you can redistribute it and/or modify it under the
  16  * terms of the GNU Lesser General Public License as published by the Free
  17  * Software Foundation; either version 2.1 of the License, or (at your option)
  18  * any later version.
  19  *
  20  * wimlib is distributed in the hope that it will be useful, but WITHOUT ANY
  21  * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  22  * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
  23  * details.
  24  *
  25  * You should have received a copy of the GNU Lesser General Public License
  26  * along with wimlib; if not, see http://www.gnu.org/licenses/.
  27  */
  28
  29 #include "comp.h"
  30 #include <string.h>
  31
  32 #define HASH_BITS       15
  33 #define HASH_SIZE       (1 << HASH_BITS)
  34 #define HASH_MASK       (HASH_SIZE - 1)
  35 #define HASH_SHIFT      ((HASH_BITS + 2) /  3)
  36
  37 /* Hash function, based on code from zlib.  This function will update and return
  38  * the hash value @hash for the string ending on the additional input character
  39  * @c.  This function must be called for each consecutive character, because it
  40  * uses a running hash value rather than computing it separately for each
  41  * 3-character string.
  42  *
  43  * The AND operation guarantees that only 3 characters will affect the hash
  44  * value, so every identical 3-character string will have the same hash value.
  45  */
  46 static inline uint update_hash(uint hash, u8 c)
  47 {
  48         return ((hash << HASH_SHIFT) ^ c) & HASH_MASK;
  49 }
  50
  51
  52 /* Insert a 3-character string at position @str_pos in @window and with hash
  53  * code @hash into the hash table described by @hash_tab and @prev_tab.  Based
  54  * on code from zlib.
  55  *
  56  * The hash table uses chains (linked lists) for the hash buckets, but there are
  57  * no real pointers involved.  Indexing `hash_tab' by hash value gives the index
  58  * within the window of the last string in the hash bucket.  To find the index
  59  * of the previous string in the hash chain, the `prev_tab' array is indexed by
  60  * the string index.  `prev_tab' can be indexed repeatedly by the string index
  61  * to walk through the hash chain, until the special index `0' is reached,
  62  * indicating the end of the hash chain.
  63  */
  64 static inline uint insert_string(u16 hash_tab[], u16 prev_tab[],
  65                                 const u8 window[], uint str_pos,
  66                                                         uint hash)
  67 {
  68         hash = update_hash(hash, window[str_pos + 2]);
  69         prev_tab[str_pos] = hash_tab[hash];
  70         hash_tab[hash] = str_pos;
  71         return hash;
  72 }
  73
  74
  75 /*
  76  * Returns the longest match for a given input position.
  77  *
  78  * @window:             The window of uncompressed data.
  79  * @bytes_remaining:    The number of bytes remaining in the window.
  80  * @strstart:           The index of the start of the string in the window that
  81  *                              we are trying to find a match for.
  82  * @prev_tab:           The array of prev pointers for the hash table.
  83  * @cur_match:          The index of the head of the hash chain for matches
  84  *                              having the hash value of the string beginning
  85  *                              at index @strstart.
  86  * @prev_len:           The length of the match that was found for the string
  87  *                              beginning at (@strstart - 1).
  88  * @match_start_ret:    A location into which the index of the start of the
  89  *                              match will be returned.
  90  * @params:             Parameters that affect how long the search will proceed
  91  *                              before going with the best that has been found
  92  *                              so far.
  93  *
  94  * Returns the length of the match that was found.
  95  */
  96 static uint longest_match(const u8 window[], uint bytes_remaining,
  97                           uint strstart, const u16 prev_tab[],
  98                           uint cur_match, uint prev_len,
  99                           uint *match_start_ret,
 100                           const struct lz_params *params)
 101 {
 102         uint chain_len = params->max_chain_len;
 103
 104         const u8 *scan = window + strstart;
 105         const u8 *match;
 106         uint len;
 107         uint best_len = prev_len;
 108         uint match_start = cur_match;
 109
 110         uint nice_match = min(params->nice_match, bytes_remaining);
 111
 112         const u8 *strend = scan + params->max_match;
 113
 114         u8 scan_end1 = scan[best_len - 1];
 115         u8 scan_end = scan[best_len];
 116
 117
 118         /* Do not waste too much time if we already have a good match: */
 119         if (best_len >= params->good_match)
 120                 chain_len >>= 2;
 121
 122         do {
 123                 match = &window[cur_match];
 124
 125
 126                 /* Skip to next match if the match length cannot increase
 127                  * or if the match length is less than 2.  Note that the checks below
 128                  * for insufficient lookahead only occur occasionally for performance
 129                  * reasons.  Therefore uninitialized memory will be accessed, and
 130                  * conditional jumps will be made that depend on those values.
 131                  * However the length of the match is limited to the lookahead, so
 132                  * the output of deflate is not affected by the uninitialized values.
 133                  */
 134
 135                 if (match[best_len]   != scan_end  ||
 136                         match[best_len-1] != scan_end1 ||
 137                         *match                  != *scan         ||
 138                         *++match                  != scan[1])     continue;
 139
 140                 /* The check at best_len-1 can be removed because it will be made
 141                  * again later. (This heuristic is not always a win.)
 142                  * It is not necessary to compare scan[2] and match[2] since they
 143                  * are always equal when the other bytes match, given that
 144                  * the hash keys are equal and that HASH_BITS >= 8.
 145                  */
 146                 scan += 2, match++;
 147
 148                 wimlib_assert(*scan == *match);
 149
 150                 /* We check for insufficient lookahead only every 8th comparison;
 151                  * the 256th check will be made at strstart+258.  */
 152                 do {
 153                 } while (*++scan == *++match && *++scan == *++match &&
 154                                  *++scan == *++match && *++scan == *++match &&
 155                                  *++scan == *++match && *++scan == *++match &&
 156                                  *++scan == *++match && *++scan == *++match &&
 157                                  scan < strend);
 158
 159                 len = params->max_match - (int)(strend - scan);
 160                 scan = strend - params->max_match;
 161
 162                 if (len > best_len) {
 163                         match_start = cur_match;
 164                         best_len = len;
 165                         if (len >= nice_match)
 166                                 break;
 167                         scan_end1  = scan[best_len - 1];
 168                         scan_end   = scan[best_len];
 169                 }
 170                 cur_match = prev_tab[cur_match];
 171         } while (--chain_len != 0 && cur_match != 0);
 172         *match_start_ret = match_start;
 173         return min(min(best_len, bytes_remaining), params->max_match);
 174 }
 175
 176
 177
 178 /*
 179  * Determines the sequence of matches and literals that a block of data will be
 180  * compressed to.
 181  *
 182  * @uncompressed_data:  The data that is to be compressed.
 183  * @uncompressed_len:   The length of @uncompressed_data, in bytes.
 184  * @match_tab:          An array for the intermediate representation of matches.
 185  * @record_match:       A function that will be called to produce the
 186  *                              intermediate representation of a match, given
 187  *                              the offset and length.  This function should also
 188  *                              update the appropriate symbol frequency counts
 189  *                              so that any needed Huffman codes can be made
 190  *                              later.
 191  * @record_literal:     A function that will be called to produce the
 192  *                              intermediate representation of a literal, given
 193  *                              the character of the literal.  This function
 194  *                              should also update the appropriate symbol
 195  *                              frequency counts so that any needed Huffman
 196  *                              codes can be made later.
 197  * @record_match_arg_1:
 198  * @record_match_arg_2: Extra arguments to be passed to @record_match.
 199  * @record_literal_arg: Extra arguments to be passed to @record_literal.
 200  * @params:             Structure that contains parameters that affect how the
 201  *                              analysis proceeds (mainly how good the matches
 202  *                              have to be).
 203  *
 204  * Returns the total number of matches and literal bytes that were found; this
 205  * is the number of slots in @match_tab that have been filled with the
 206  * intermediate representation of a match or literal byte.
 207  */
 208 uint lz_analyze_block(const u8 uncompressed_data[], uint uncompressed_len,
 209                         u32 match_tab[], lz_record_match_t record_match,
 210                         lz_record_literal_t record_literal, void *record_match_arg1,
 211                         void *record_match_arg2, void *record_literal_arg,
 212                         const struct lz_params *params)
 213 {
 214         uint cur_match_pos = 0;
 215         uint cur_input_pos = 0;
 216         uint hash          = 0;
 217         uint hash_head     = 0;
 218         uint prev_len      = params->min_match - 1;
 219         uint prev_start;
 220         uint match_len     = params->min_match - 1;
 221         uint match_start   = 0;
 222         bool match_available = false;
 223         u16 hash_tab[HASH_SIZE];
 224         u32 match;
 225         u16 prev_tab[uncompressed_len];
 226
 227         ZERO_ARRAY(hash_tab);
 228         ZERO_ARRAY(prev_tab);
 229
 230         do {
 231                 /* If there are at least 3 characters remaining in the input,
 232                  * insert the 3-character string beginning at
 233                  * uncompressed_data[cur_input_pos] into the hash table.
 234                  *
 235                  * hash_head is set to the index of the previous string in the
 236                  * hash bucket, or 0 if there is no such string */
 237                 if (uncompressed_len - cur_input_pos >= params->min_match) {
 238                         hash = insert_string(hash_tab, prev_tab,
 239                                                 uncompressed_data,
 240                                                 cur_input_pos, hash);
 241                         hash_head = prev_tab[cur_input_pos];
 242                 } else {
 243                         hash_head = 0;
 244                 }
 245
 246
 247                 /* Find the longest match, discarding those <= prev_len. */
 248                 prev_len = match_len;
 249                 prev_start = match_start;
 250                 match_len = params->min_match - 1;
 251
 252                 if (hash_head != 0 && prev_len < params->min_match) {
 253                         /* To simplify the code, we prevent matches with the
 254                          * string of window index 0 (in particular we have to
 255                          * avoid a match of the string with itself at the start
 256                          * of the input file).  */
 257                         match_len = longest_match(uncompressed_data,
 258                                                 uncompressed_len - cur_input_pos,
 259                                                 cur_input_pos, prev_tab,
 260                                                 hash_head, prev_len,
 261                                                 &match_start, params);
 262
 263                         if (match_len == params->min_match &&
 264                                 cur_input_pos - match_start > params->too_far)
 265                         {
 266                                 match_len = params->min_match - 1;
 267                         }
 268                 }
 269
 270                 /* If there was a match at the previous step and the current
 271                  * match is not better, output the previous match:
 272                  */
 273                 if (prev_len >= params->min_match && match_len <= prev_len) {
 274
 275                         /* Do not insert strings in hash table beyond this. */
 276                         uint max_insert = uncompressed_len - params->min_match;
 277
 278                         /*DEBUG("Recording match (pos = %u, offset = %u, len = %u)\n",*/
 279                                         /*cur_input_pos - 1, */
 280                                         /*cur_input_pos - 1 - prev_start,*/
 281                                         /*prev_len);*/
 282
 283                         match = (*record_match)(cur_input_pos - 1 - prev_start,
 284                                                         prev_len,
 285                                                         record_match_arg1,
 286                                                         record_match_arg2);
 287
 288                         match_tab[cur_match_pos++] = match;
 289
 290                         /* Insert in hash table all strings up to the end of the match.
 291                          * strstart-1 and strstart are already inserted. If there is not
 292                          * enough lookahead, the last two strings are not inserted in
 293                          * the hash table.
 294                          */
 295                         prev_len -= 2;
 296
 297                         do {
 298                                 if (++cur_input_pos <= max_insert) {
 299                                         hash = insert_string(hash_tab, prev_tab,
 300                                                                 uncompressed_data,
 301                                                                 cur_input_pos,
 302                                                                 hash);
 303                                 }
 304                         } while (--prev_len != 0);
 305
 306                         match_available = false;
 307                         match_len = params->min_match - 1;
 308                 } else if (match_available) {
 309                         /* If there was no match at the previous position, output a
 310                          * single literal. If there was a match but the current match
 311                          * is longer, truncate the previous match to a single literal.
 312                          */
 313
 314                         /*DEBUG("Recording litrl (pos = %u, value = %u)\n",*/
 315                                         /*cur_input_pos - 1, */
 316                                         /*uncompressed_data[cur_input_pos - 1]);*/
 317
 318                         match = (*record_literal)(
 319                                         uncompressed_data[cur_input_pos - 1],
 320                                                         record_literal_arg);
 321                         match_tab[cur_match_pos++] = match;
 322                 } else {
 323                         /* There is no previous match to compare with, wait for
 324                          * the next step to decide.  */
 325                         match_available = true;
 326                 }
 327         } while (++cur_input_pos < uncompressed_len);
 328
 329         if (match_available) {
 330                 match = (*record_literal)(uncompressed_data[cur_input_pos - 1],
 331                                                 record_literal_arg);
 332                 match_tab[cur_match_pos++] = match;
 333         }
 334         return cur_match_pos;
 335 }