wimlib.net Git - wimlib/blob - src/lz77.c

   1 /*
   2  * lz77.c
   3  *
   4  * This file provides the code to analyze a buffer of uncompressed data for
   5  * matches, as per the LZ77 algorithm.  It uses a hash table to accelerate the
   6  * process.  This is based on code from zlib (v. 1.2.5).
   7  */
   8
   9 /*
  10  * Copyright (C) 2012, 2013 Eric Biggers
  11  * Copyright (C) 1995-2010 Jean-loup Gailly and Mark Adler
  12  *
  13  * This file is part of wimlib, a library for working with WIM files.
  14  *
  15  * wimlib is free software; you can redistribute it and/or modify it under the
  16  * terms of the GNU General Public License as published by the Free
  17  * Software Foundation; either version 3 of the License, or (at your option)
  18  * any later version.
  19  *
  20  * wimlib is distributed in the hope that it will be useful, but WITHOUT ANY
  21  * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  22  * A PARTICULAR PURPOSE. See the GNU General Public License for more
  23  * details.
  24  *
  25  * You should have received a copy of the GNU General Public License
  26  * along with wimlib; if not, see http://www.gnu.org/licenses/.
  27  */
  28
  29 #ifdef HAVE_CONFIG_H
  30 #  include <config.h>
  31 #endif
  32
  33 #include "wimlib/compress_common.h"
  34 #include "wimlib/util.h"
  35
  36 #include <string.h>
  37
  38 #define HASH_BITS       15
  39 #define HASH_SIZE       (1 << HASH_BITS)
  40 #define HASH_MASK       (HASH_SIZE - 1)
  41 #define HASH_SHIFT      5
  42
  43 /* Hash function, based on code from zlib.  This function will update and return
  44  * the hash value @hash for the string ending on the additional input character
  45  * @c.  This function must be called for each consecutive character, because it
  46  * uses a running hash value rather than computing it separately for each
  47  * 3-character string.
  48  *
  49  * The AND operation guarantees that only 3 characters will affect the hash
  50  * value, so every identical 3-character string will have the same hash value.
  51  */
  52 static inline unsigned
  53 update_hash(unsigned hash, u8 c)
  54 {
  55         return ((hash << HASH_SHIFT) ^ c) & HASH_MASK;
  56 }
  57
  58
  59 /* Insert a 3-character string at position @str_pos in @window and with hash
  60  * code @hash into the hash table described by @hash_tab and @prev_tab.  Based
  61  * on code from zlib.
  62  *
  63  * The hash table uses chains (linked lists) for the hash buckets, but there are
  64  * no real pointers involved.  Indexing `hash_tab' by hash value gives the index
  65  * within the window of the last string in the hash bucket.  To find the index
  66  * of the previous string in the hash chain, the `prev_tab' array is indexed by
  67  * the string index.  `prev_tab' can be indexed repeatedly by the string index
  68  * to walk through the hash chain, until the special index `0' is reached,
  69  * indicating the end of the hash chain.
  70  */
  71 static inline unsigned
  72 insert_string(input_idx_t hash_tab[], input_idx_t prev_tab[],
  73               const u8 window[], unsigned str_pos,
  74               unsigned hash)
  75 {
  76         hash = update_hash(hash, window[str_pos + 2]);
  77         prev_tab[str_pos] = hash_tab[hash];
  78         hash_tab[hash] = str_pos;
  79         return hash;
  80 }
  81
  82
  83 /*
  84  * Returns the longest match for a given input position.
  85  *
  86  * @window:             The window of uncompressed data.
  87  * @bytes_remaining:    The number of bytes remaining in the window.
  88  * @strstart:           The index of the start of the string in the window that
  89  *                              we are trying to find a match for.
  90  * @prev_tab:           The array of prev pointers for the hash table.
  91  * @cur_match:          The index of the head of the hash chain for matches
  92  *                              having the hash value of the string beginning
  93  *                              at index @strstart.
  94  * @prev_len:           The length of the match that was found for the string
  95  *                              beginning at (@strstart - 1).
  96  * @match_start_ret:    A location into which the index of the start of the
  97  *                              match will be returned.
  98  * @params:             Parameters that affect how long the search will proceed
  99  *                              before going with the best that has been found
 100  *                              so far.
 101  * @min_start_pos:      If the chain reaches a match starting before this
 102  *                      position (including the end-of-chain 0), the search will
 103  *                      be terminated.
 104  *
 105  * Returns the length of the match that was found.
 106  */
 107 static unsigned
 108 longest_match(const u8 window[], unsigned bytes_remaining,
 109               unsigned strstart, const input_idx_t prev_tab[],
 110               unsigned cur_match, unsigned prev_len,
 111               unsigned *match_start_ret,
 112               const struct lz_params *params,
 113               unsigned min_start_pos)
 114 {
 115         unsigned chain_len = params->max_chain_len;
 116
 117         const u8 *scan = window + strstart;
 118         const u8 *match;
 119         unsigned len;
 120         unsigned best_len = prev_len;
 121         unsigned match_start = cur_match;
 122
 123         unsigned nice_match = min(params->nice_match, bytes_remaining);
 124
 125         const u8 *strend = scan + min(params->max_match, bytes_remaining);
 126
 127         u8 scan_end1 = scan[best_len - 1];
 128         u8 scan_end = scan[best_len];
 129
 130
 131         /* Do not waste too much time if we already have a good match: */
 132         if (best_len >= params->good_match)
 133                 chain_len >>= 2;
 134
 135         do {
 136                 match = &window[cur_match];
 137
 138                 /* Skip to next match if the match length cannot increase or if
 139                  * the match length is less than 2.  Note that the checks below
 140                  * for insufficient lookahead only occur occasionally for
 141                  * performance reasons.  Therefore uninitialized memory will be
 142                  * accessed, and conditional jumps will be made that depend on
 143                  * those values.  However the length of the match is limited to
 144                  * the lookahead, so the output of lz_analyze_block() is not
 145                  * affected by the uninitialized values.  */
 146
 147                 if (match[best_len] != scan_end
 148                     || match[best_len - 1] != scan_end1
 149                     || *match != *scan
 150                     || *++match != scan[1])
 151                         continue;
 152                 scan++;
 153
 154         #if 0
 155                 do {
 156                 } while (scan < strend && *++match == *++scan);
 157         #else
 158
 159                 do {
 160                 } while (
 161                          *++match == *++scan && *++match == *++scan &&
 162                          *++match == *++scan && *++match == *++scan &&
 163                          *++match == *++scan && *++match == *++scan &&
 164                          *++match == *++scan && *++match == *++scan &&
 165                          scan < strend);
 166         #endif
 167                 len = match - &window[cur_match];
 168
 169                 scan = &window[strstart];
 170
 171                 if (len > best_len) {
 172                         match_start = cur_match;
 173                         best_len = len;
 174                         if (len >= nice_match)
 175                                 break;
 176                         scan_end1  = scan[best_len - 1];
 177                         scan_end   = scan[best_len];
 178                 }
 179         } while (--chain_len != 0 && (cur_match = prev_tab[cur_match]) >= min_start_pos);
 180         *match_start_ret = match_start;
 181         return min(min(best_len, bytes_remaining), params->max_match);
 182 }
 183
 184
 185
 186 /*
 187  * Determines the sequence of matches and literals that a block of data will be
 188  * compressed to.
 189  *
 190  * @window:             The data that is to be compressed.
 191  * @window_size:        The length of @window, in bytes.
 192  * @record_match:       Consumer for matches.
 193  * @record_literal:     Consumer for literals.
 194  * @record_ctx:         Context passed to @record_match and @record_literal.
 195  * @params:             Structure that contains parameters that affect how the
 196  *                              analysis proceeds (mainly how good the matches
 197  *                              have to be).
 198  * @prev_tab:           Temporary space containing least @window_size elements.
 199  */
 200 void
 201 lz_analyze_block(const u8 window[restrict],
 202                  input_idx_t window_size,
 203                  lz_record_match_t record_match,
 204                  lz_record_literal_t record_literal,
 205                  void *record_ctx,
 206                  const struct lz_params *params,
 207                  input_idx_t prev_tab[restrict])
 208 {
 209         unsigned cur_input_pos = 0;
 210         unsigned hash          = 0;
 211         unsigned hash_head     = 0;
 212         unsigned prev_len      = params->min_match - 1;
 213         unsigned prev_start;
 214         unsigned match_len     = params->min_match - 1;
 215         unsigned match_start   = 0;
 216         bool match_available = false;
 217         input_idx_t hash_tab[HASH_SIZE];
 218         unsigned min_start_pos = 1;
 219
 220         ZERO_ARRAY(hash_tab);
 221
 222         do {
 223                 /* If there are at least 3 characters remaining in the input,
 224                  * insert the 3-character string beginning at
 225                  * window[cur_input_pos] into the hash table.
 226                  *
 227                  * hash_head is set to the index of the previous string in the
 228                  * hash bucket, or 0 if there is no such string */
 229                 if (window_size - cur_input_pos >= params->min_match) {
 230                         hash = insert_string(hash_tab, prev_tab,
 231                                              window,
 232                                              cur_input_pos, hash);
 233                         hash_head = prev_tab[cur_input_pos];
 234                 } else {
 235                         hash_head = 0;
 236                 }
 237
 238
 239                 /* Find the longest match, discarding those <= prev_len. */
 240                 prev_len = match_len;
 241                 prev_start = match_start;
 242                 match_len = params->min_match - 1;
 243
 244                 if (cur_input_pos > params->max_offset)
 245                         min_start_pos = cur_input_pos - params->max_offset;
 246                 else
 247                         min_start_pos = 1;
 248
 249                 if (hash_head >= min_start_pos &&
 250                     prev_len < params->max_lazy_match)
 251                 {
 252                         /* To simplify the code, we prevent matches with the
 253                          * string of window index 0 (in particular we have to
 254                          * avoid a match of the string with itself at the start
 255                          * of the input file).  */
 256                         match_len = longest_match(window,
 257                                                   window_size - cur_input_pos,
 258                                                   cur_input_pos, prev_tab,
 259                                                   hash_head, prev_len,
 260                                                   &match_start, params,
 261                                                   min_start_pos);
 262
 263                         if (match_len == params->min_match &&
 264                              cur_input_pos - match_start > params->too_far)
 265                                 match_len = params->min_match - 1;
 266                 }
 267
 268                 /* If there was a match at the previous step and the current
 269                  * match is not better, output the previous match:
 270                  */
 271                 if (prev_len >= params->min_match && match_len <= prev_len) {
 272
 273
 274                         (*record_match)(prev_len,
 275                                         cur_input_pos - 1 - prev_start,
 276                                         record_ctx);
 277
 278                         /* Insert in hash table all strings up to the end of the
 279                          * match.  strstart-1 and strstart are already inserted.
 280                          * If there is not enough lookahead, the last two
 281                          * strings are not inserted in the hash table.  */
 282
 283                         /* Do not insert strings in hash table beyond this. */
 284                         unsigned max_insert = window_size - params->min_match;
 285
 286                         prev_len -= 2;
 287
 288                         do {
 289                                 if (++cur_input_pos <= max_insert) {
 290                                         hash = insert_string(hash_tab, prev_tab,
 291                                                              window,
 292                                                              cur_input_pos,
 293                                                              hash);
 294                                 }
 295                         } while (--prev_len != 0);
 296                         match_available = false;
 297                         match_len = params->min_match - 1;
 298                 } else if (match_available) {
 299                         /* If there was no match at the previous position,
 300                          * output a single literal. If there was a match but the
 301                          * current match is longer, truncate the previous match
 302                          * to a single literal.  */
 303                         (*record_literal)(window[cur_input_pos - 1], record_ctx);
 304                 } else {
 305                         /* There is no previous match to compare with, wait for
 306                          * the next step to decide.  */
 307                         match_available = true;
 308                 }
 309         } while (++cur_input_pos < window_size);
 310
 311         if (match_available)
 312                 (*record_literal)(window[cur_input_pos - 1], record_ctx);
 313 }