wimlib.net Git - wimlib/blob - src/lz77.c

   1 /*
   2  * lz.c
   3  *
   4  * This file provides the code to analyze a buffer of uncompressed data for
   5  * matches, as per the LZ77 algorithm.  It uses a hash table to accelerate the
   6  * process.  This is based on code from zlib (v. 1.2.5).
   7  */
   8
   9 /*
  10  * Copyright (C) 2012, 2013 Eric Biggers
  11  * Copyright (C) 1995-2010 Jean-loup Gailly and Mark Adler
  12  *
  13  * This file is part of wimlib, a library for working with WIM files.
  14  *
  15  * wimlib is free software; you can redistribute it and/or modify it under the
  16  * terms of the GNU General Public License as published by the Free
  17  * Software Foundation; either version 3 of the License, or (at your option)
  18  * any later version.
  19  *
  20  * wimlib is distributed in the hope that it will be useful, but WITHOUT ANY
  21  * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  22  * A PARTICULAR PURPOSE. See the GNU General Public License for more
  23  * details.
  24  *
  25  * You should have received a copy of the GNU General Public License
  26  * along with wimlib; if not, see http://www.gnu.org/licenses/.
  27  */
  28
  29 #include "compress.h"
  30 #include <string.h>
  31
  32 #define LZ_MIN_MATCH 3
  33
  34 #define HASH_BITS       15
  35 #define HASH_SIZE       (1 << HASH_BITS)
  36 #define HASH_MASK       (HASH_SIZE - 1)
  37
  38 #if LZ_MIN_MATCH == 2
  39 #       define HASH_SHIFT       8
  40 #elif LZ_MIN_MATCH == 3
  41 #       define HASH_SHIFT       5
  42 #else
  43 #error "Invalid LZ_MIN_MATCH"
  44 #endif
  45
  46 /* Hash function, based on code from zlib.  This function will update and return
  47  * the hash value @hash for the string ending on the additional input character
  48  * @c.  This function must be called for each consecutive character, because it
  49  * uses a running hash value rather than computing it separately for each
  50  * 3-character string.
  51  *
  52  * The AND operation guarantees that only 3 characters will affect the hash
  53  * value, so every identical 3-character string will have the same hash value.
  54  */
  55 static inline unsigned update_hash(unsigned hash, u8 c)
  56 {
  57         return ((hash << HASH_SHIFT) ^ c) & HASH_MASK;
  58 }
  59
  60
  61 /* Insert a 3-character string at position @str_pos in @window and with hash
  62  * code @hash into the hash table described by @hash_tab and @prev_tab.  Based
  63  * on code from zlib.
  64  *
  65  * The hash table uses chains (linked lists) for the hash buckets, but there are
  66  * no real pointers involved.  Indexing `hash_tab' by hash value gives the index
  67  * within the window of the last string in the hash bucket.  To find the index
  68  * of the previous string in the hash chain, the `prev_tab' array is indexed by
  69  * the string index.  `prev_tab' can be indexed repeatedly by the string index
  70  * to walk through the hash chain, until the special index `0' is reached,
  71  * indicating the end of the hash chain.
  72  */
  73 static inline unsigned insert_string(u16 hash_tab[], u16 prev_tab[],
  74                                      const u8 window[], unsigned str_pos,
  75                                      unsigned hash)
  76 {
  77         hash = update_hash(hash, window[str_pos + LZ_MIN_MATCH - 1]);
  78         prev_tab[str_pos] = hash_tab[hash];
  79         hash_tab[hash] = str_pos;
  80         return hash;
  81 }
  82
  83
  84 /*
  85  * Returns the longest match for a given input position.
  86  *
  87  * @window:             The window of uncompressed data.
  88  * @bytes_remaining:    The number of bytes remaining in the window.
  89  * @strstart:           The index of the start of the string in the window that
  90  *                              we are trying to find a match for.
  91  * @prev_tab:           The array of prev pointers for the hash table.
  92  * @cur_match:          The index of the head of the hash chain for matches
  93  *                              having the hash value of the string beginning
  94  *                              at index @strstart.
  95  * @prev_len:           The length of the match that was found for the string
  96  *                              beginning at (@strstart - 1).
  97  * @match_start_ret:    A location into which the index of the start of the
  98  *                              match will be returned.
  99  * @params:             Parameters that affect how long the search will proceed
 100  *                              before going with the best that has been found
 101  *                              so far.
 102  *
 103  * Returns the length of the match that was found.
 104  */
 105 static unsigned longest_match(const u8 window[], unsigned bytes_remaining,
 106                               unsigned strstart, const u16 prev_tab[],
 107                               unsigned cur_match, unsigned prev_len,
 108                               unsigned *match_start_ret,
 109                               const struct lz_params *params)
 110 {
 111         unsigned chain_len = params->max_chain_len;
 112
 113         const u8 *scan = window + strstart;
 114         const u8 *match;
 115         unsigned len;
 116         unsigned best_len = prev_len;
 117         unsigned match_start = cur_match;
 118
 119         unsigned nice_match = min(params->nice_match, bytes_remaining);
 120
 121         const u8 *strend = scan + min(params->max_match, bytes_remaining);
 122
 123         u8 scan_end1 = scan[best_len - 1];
 124         u8 scan_end = scan[best_len];
 125
 126
 127         /* Do not waste too much time if we already have a good match: */
 128         if (best_len >= params->good_match)
 129                 chain_len >>= 2;
 130
 131         do {
 132                 match = &window[cur_match];
 133
 134                 /* Skip to next match if the match length cannot increase or if
 135                  * the match length is less than 2.  Note that the checks below
 136                  * for insufficient lookahead only occur occasionally for
 137                  * performance reasons.  Therefore uninitialized memory will be
 138                  * accessed, and conditional jumps will be made that depend on
 139                  * those values.  However the length of the match is limited to
 140                  * the lookahead, so the output of deflate is not affected by
 141                  * the uninitialized values.
 142                  */
 143
 144                 if (match[best_len] != scan_end
 145                     || match[best_len - 1] != scan_end1
 146                     || *match != *scan
 147                     || *++match != scan[1])
 148                         continue;
 149                 scan++;
 150
 151         #if 0
 152                 do {
 153                 } while (scan < strend && *++match == *++scan);
 154         #else
 155
 156                 do {
 157                 } while (
 158                          *++match == *++scan && *++match == *++scan &&
 159                          *++match == *++scan && *++match == *++scan &&
 160                          *++match == *++scan && *++match == *++scan &&
 161                          *++match == *++scan && *++match == *++scan &&
 162                          scan < strend);
 163         #endif
 164                 len = match - &window[cur_match];
 165
 166                 scan = &window[strstart];
 167
 168                 if (len > best_len) {
 169                         match_start = cur_match;
 170                         best_len = len;
 171                         if (len >= nice_match)
 172                                 break;
 173                         scan_end1  = scan[best_len - 1];
 174                         scan_end   = scan[best_len];
 175                 }
 176         } while (--chain_len != 0 && (cur_match = prev_tab[cur_match]) != 0);
 177         *match_start_ret = match_start;
 178         return min(min(best_len, bytes_remaining), params->max_match);
 179 }
 180
 181
 182
 183 /*
 184  * Determines the sequence of matches and literals that a block of data will be
 185  * compressed to.
 186  *
 187  * @uncompressed_data:  The data that is to be compressed.
 188  * @uncompressed_len:   The length of @uncompressed_data, in bytes.
 189  * @match_tab:          An array for the intermediate representation of matches.
 190  * @record_match:       A function that will be called to produce the
 191  *                              intermediate representation of a match, given
 192  *                              the offset and length.  This function should also
 193  *                              update the appropriate symbol frequency counts
 194  *                              so that any needed Huffman codes can be made
 195  *                              later.
 196  * @record_literal:     A function that will be called to produce the
 197  *                              intermediate representation of a literal, given
 198  *                              the character of the literal.  This function
 199  *                              should also update the appropriate symbol
 200  *                              frequency counts so that any needed Huffman
 201  *                              codes can be made later.
 202  * @record_match_arg_1:
 203  * @record_match_arg_2: Extra arguments to be passed to @record_match.
 204  * @record_literal_arg: Extra arguments to be passed to @record_literal.
 205  * @params:             Structure that contains parameters that affect how the
 206  *                              analysis proceeds (mainly how good the matches
 207  *                              have to be).
 208  *
 209  * Returns the total number of matches and literal bytes that were found; this
 210  * is the number of slots in @match_tab that have been filled with the
 211  * intermediate representation of a match or literal byte.
 212  */
 213 unsigned lz_analyze_block(const u8 uncompressed_data[],
 214                           unsigned uncompressed_len,
 215                           u32 match_tab[],
 216                           lz_record_match_t record_match,
 217                           lz_record_literal_t record_literal,
 218                           void *record_match_arg1,
 219                           void *record_match_arg2,
 220                           void *record_literal_arg,
 221                           const struct lz_params *params)
 222 {
 223         unsigned cur_match_pos = 0;
 224         unsigned cur_input_pos = 0;
 225         unsigned hash          = 0;
 226         unsigned hash_head     = 0;
 227         unsigned prev_len      = params->min_match - 1;
 228         unsigned prev_start;
 229         unsigned match_len     = params->min_match - 1;
 230         unsigned match_start   = 0;
 231         bool match_available = false;
 232         u16 hash_tab[HASH_SIZE];
 233         u32 match;
 234         u16 prev_tab[uncompressed_len];
 235
 236         ZERO_ARRAY(hash_tab);
 237         ZERO_ARRAY(prev_tab);
 238
 239         do {
 240                 /* If there are at least 3 characters remaining in the input,
 241                  * insert the 3-character string beginning at
 242                  * uncompressed_data[cur_input_pos] into the hash table.
 243                  *
 244                  * hash_head is set to the index of the previous string in the
 245                  * hash bucket, or 0 if there is no such string */
 246                 if (uncompressed_len - cur_input_pos >= params->min_match) {
 247                         hash = insert_string(hash_tab, prev_tab,
 248                                              uncompressed_data,
 249                                              cur_input_pos, hash);
 250                         hash_head = prev_tab[cur_input_pos];
 251                 } else {
 252                         hash_head = 0;
 253                 }
 254
 255
 256                 /* Find the longest match, discarding those <= prev_len. */
 257                 prev_len = match_len;
 258                 prev_start = match_start;
 259                 match_len = params->min_match - 1;
 260
 261                 if (hash_head != 0 && prev_len < params->max_lazy_match) {
 262                         /* To simplify the code, we prevent matches with the
 263                          * string of window index 0 (in particular we have to
 264                          * avoid a match of the string with itself at the start
 265                          * of the input file).  */
 266                         match_len = longest_match(uncompressed_data,
 267                                                   uncompressed_len - cur_input_pos,
 268                                                   cur_input_pos, prev_tab,
 269                                                   hash_head, prev_len,
 270                                                   &match_start, params);
 271
 272                         if (match_len == params->min_match &&
 273                              cur_input_pos - match_start > params->too_far)
 274                                 match_len = params->min_match - 1;
 275                 }
 276
 277                 /* If there was a match at the previous step and the current
 278                  * match is not better, output the previous match:
 279                  */
 280                 if (prev_len >= params->min_match && match_len <= prev_len) {
 281
 282                         /* Do not insert strings in hash table beyond this. */
 283                         unsigned max_insert = uncompressed_len - params->min_match;
 284
 285                         /*DEBUG("Recording match (pos = %u, offset = %u, len = %u)\n",*/
 286                                         /*cur_input_pos - 1, */
 287                                         /*cur_input_pos - 1 - prev_start,*/
 288                                         /*prev_len);*/
 289
 290                         match = (*record_match)(cur_input_pos - 1 - prev_start,
 291                                                 prev_len,
 292                                                 record_match_arg1,
 293                                                 record_match_arg2);
 294
 295                         match_tab[cur_match_pos++] = match;
 296
 297                         /* Insert in hash table all strings up to the end of the match.
 298                          * strstart-1 and strstart are already inserted. If there is not
 299                          * enough lookahead, the last two strings are not inserted in
 300                          * the hash table.
 301                          */
 302 #if LZ_MIN_MATCH == 2
 303                         if (prev_len >= 3)
 304 #endif
 305                         {
 306                                 prev_len -= 2;
 307
 308                                 do {
 309                                         if (++cur_input_pos <= max_insert) {
 310                                                 hash = insert_string(hash_tab, prev_tab,
 311                                                                      uncompressed_data,
 312                                                                      cur_input_pos,
 313                                                                      hash);
 314                                         }
 315                                 } while (--prev_len != 0);
 316                         }
 317                         match_available = false;
 318                         match_len = params->min_match - 1;
 319                 } else if (match_available) {
 320                         /* If there was no match at the previous position, output a
 321                          * single literal. If there was a match but the current match
 322                          * is longer, truncate the previous match to a single literal.
 323                          */
 324
 325                         /*DEBUG("Recording litrl (pos = %u, value = %u)\n",*/
 326                                         /*cur_input_pos - 1, */
 327                                         /*uncompressed_data[cur_input_pos - 1]);*/
 328
 329                         match = (*record_literal)(
 330                                         uncompressed_data[cur_input_pos - 1],
 331                                                         record_literal_arg);
 332                         match_tab[cur_match_pos++] = match;
 333                 } else {
 334                         /* There is no previous match to compare with, wait for
 335                          * the next step to decide.  */
 336                         match_available = true;
 337                 }
 338         } while (++cur_input_pos < uncompressed_len);
 339
 340         if (match_available) {
 341                 match = (*record_literal)(uncompressed_data[cur_input_pos - 1],
 342                                                 record_literal_arg);
 343                 match_tab[cur_match_pos++] = match;
 344         }
 345         return cur_match_pos;
 346 }