4 * This file provides the code to analyze a buffer of uncompressed data for
5 * matches, as per the LZ77 algorithm. It uses a hash table to accelerate the
6 * process. This is based on code from zlib (v. 1.2.5).
10 * Copyright (C) 2012, 2013 Eric Biggers
11 * Copyright (C) 1995-2010 Jean-loup Gailly and Mark Adler
13 * This file is part of wimlib, a library for working with WIM files.
15 * wimlib is free software; you can redistribute it and/or modify it under the
16 * terms of the GNU General Public License as published by the Free
17 * Software Foundation; either version 3 of the License, or (at your option) any later version.
20 * wimlib is distributed in the hope that it will be useful, but WITHOUT ANY
21 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
22 * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
25 * You should have received a copy of the GNU General Public License
26 * along with wimlib; if not, see http://www.gnu.org/licenses/.
33 #include "wimlib/compress_common.h"
34 #include "wimlib/util.h"
/* Number of hash buckets: 1 shifted left by HASH_BITS, so the bucket count is
 * always a power of two.
 * NOTE(review): HASH_BITS is not defined on any line visible in this listing
 * (the embedded numbering skips from 34 to 39); confirm its value against the
 * complete file. */
39 #define HASH_SIZE (1 << HASH_BITS)
/* Bit mask with every bit below HASH_SIZE set; ANDing a value with it yields
 * a bucket index in the range 0 .. HASH_SIZE - 1. */
40 #define HASH_MASK (HASH_SIZE - 1)
43 /* Hash function, based on code from zlib. This function will update and return
44 * the hash value @hash for the string ending on the additional input character
45 * @c. This function must be called for each consecutive character, because it
46 * uses a running hash value rather than computing it separately for each 3-character string.
49 * The AND operation guarantees that only 3 characters will affect the hash
50 * value, so every identical 3-character string will have the same hash value.
/* Fold one more input byte into the running hash: the previous value @hash is
 * shifted left by HASH_SHIFT bits, XORed with @c, and masked with HASH_MASK.
 * Neither argument is modified; the new hash is the return value.
 *
 * NOTE(review): the embedded listing numbers skip 54 and everything after 55
 * up to 59, so the lines holding the braces of the function body are
 * presumably missing from this view.  HASH_SHIFT is likewise not defined on
 * any visible line.  Confirm both against the complete file. */
52 static inline unsigned
53 update_hash(unsigned hash, u8 c)
55 return ((hash << HASH_SHIFT) ^ c) & HASH_MASK;
59 /* Insert a 3-character string at position @str_pos in @window and with hash
60 * code @hash into the hash table described by @hash_tab and @prev_tab. Based on code from zlib.
63 * The hash table uses chains (linked lists) for the hash buckets, but there are
64 * no real pointers involved. Indexing `hash_tab' by hash value gives the index
65 * within the window of the last string in the hash bucket. To find the index
66 * of the previous string in the hash chain, the `prev_tab' array is indexed by
67 * the string index. `prev_tab' can be indexed repeatedly by the string index
68 * to walk through the hash chain, until the special index `0' is reached,
69 * indicating the end of the hash chain.
/* Link position @str_pos into the hash chain for its hash value.  The running
 * hash is first advanced with the byte at window[str_pos + 2]; the old head of
 * that bucket is saved in prev_tab[str_pos]; then @str_pos becomes the new
 * head stored in hash_tab.
 *
 * NOTE(review): the embedded listing numbers skip 74-75 and everything after
 * 78, so the end of the parameter list, the opening brace and the end of the
 * body are not visible.  Because the body assigns to a variable named hash and
 * the function is declared to return unsigned, it presumably receives the
 * running hash as a final parameter and returns the updated value; confirm
 * against the complete file. */
71 static inline unsigned
72 insert_string(input_idx_t hash_tab[], input_idx_t prev_tab[],
73 const u8 window[], unsigned str_pos,
/* Advance the running hash with the third byte of the string at @str_pos. */
76 hash = update_hash(hash, window[str_pos + 2]);
/* Remember the previous head of this bucket so the chain can be walked. */
77 prev_tab[str_pos] = hash_tab[hash];
/* The newest string always becomes the head of its bucket. */
78 hash_tab[hash] = str_pos;
84 * Returns the longest match for a given input position.
86 * @window: The window of uncompressed data.
87 * @bytes_remaining: The number of bytes remaining in the window.
88 * @strstart: The index of the start of the string in the window that
89 * we are trying to find a match for.
90 * @prev_tab: The array of prev pointers for the hash table.
91 * @cur_match: The index of the head of the hash chain for matches
92 * having the hash value of the string beginning at index @strstart.
94 * @prev_len: The length of the match that was found for the string
95 * beginning at (@strstart - 1).
96 * @match_start_ret: A location into which the index of the start of the
97 * match will be returned.
98 * @params: Parameters that affect how long the search will proceed
99 * before going with the best that has been found so far.
101 * @min_start_pos: If the chain reaches a match starting before this
102 * position (including the end-of-chain 0), the search will be terminated.
105 * Returns the length of the match that was found.
/* Walk the hash chain that starts at @cur_match and look for the candidate
 * position whose bytes agree with the string at @strstart for the longest run.
 * The start of the best candidate is stored through @match_start_ret, and the
 * best length, clamped to @bytes_remaining and params->max_match, is returned.
 *
 * NOTE(review): this is a line-numbered listing with gaps.  Among others,
 * embedded lines 107, 114, 118-119, 133-135, 149, 151-155, 157-160, 173 and
 * 175 are absent.  As a result the return-type line, the braces, the
 * declarations of match and len, the statement controlled by the good_match
 * test, the start of the chain loop and parts of the comparison code are not
 * visible here.  Confirm anything that depends on them against the complete
 * file. */
108 longest_match(const u8 window[], unsigned bytes_remaining,
109 unsigned strstart, const input_idx_t prev_tab[],
110 unsigned cur_match, unsigned prev_len,
111 unsigned *match_start_ret,
112 const struct lz_params *params,
113 unsigned min_start_pos)
/* Budget of chain entries to examine; it is counted down in the loop condition
 * at the bottom of the chain walk. */
115 unsigned chain_len = params->max_chain_len;
/* Start of the string a match is wanted for. */
117 const u8 *scan = window + strstart;
/* Only a candidate longer than the match found at the previous position
 * replaces the initial values below (see the len > best_len test). */
120 unsigned best_len = prev_len;
121 unsigned match_start = cur_match;
/* Threshold used by the len >= nice_match test further down; it never exceeds
 * the bytes that remain.  The statement controlled by that test is not
 * visible in this listing. */
123 unsigned nice_match = min(params->nice_match, bytes_remaining);
/* Limit that scan is checked against while comparing: at most max_match bytes
 * past the start and never beyond the remaining input. */
125 const u8 *strend = scan + min(params->max_match, bytes_remaining);
/* Bytes of the current string at offsets best_len - 1 and best_len.  A
 * candidate is compared in full only if it agrees on both of them. */
127 u8 scan_end1 = scan[best_len - 1];
128 u8 scan_end = scan[best_len];
131 /* Do not waste too much time if we already have a good match: */
132 if (best_len >= params->good_match)
/* Candidate string at the current chain position. */
136 match = &window[cur_match];
138 /* Skip to next match if the match length cannot increase or if
139 * the match length is less than 2. Note that the checks below
140 * for insufficient lookahead only occur occasionally for
141 * performance reasons. Therefore uninitialized memory will be
142 * accessed, and conditional jumps will be made that depend on
143 * those values. However the length of the match is limited to
144 * the lookahead, so the output of lz_analyze_block() is not
145 * affected by the uninitialized values. */
147 if (match[best_len] != scan_end
148 || match[best_len - 1] != scan_end1
150 || *++match != scan[1])
/* NOTE(review): two forms of the byte comparison loop appear below (embedded
 * line 156 and embedded lines 161-164).  The lines between and around them
 * are absent, so which form is actually compiled cannot be determined from
 * this listing. */
156 } while (scan < strend && *++match == *++scan);
161 *++match == *++scan && *++match == *++scan &&
162 *++match == *++scan && *++match == *++scan &&
163 *++match == *++scan && *++match == *++scan &&
164 *++match == *++scan && *++match == *++scan &&
/* Length of the run: how far match advanced from the start of the candidate. */
167 len = match - &window[cur_match];
/* Rewind scan to the start of the string for the next candidate. */
169 scan = &window[strstart];
171 if (len > best_len) {
172 match_start = cur_match;
174 if (len >= nice_match)
/* Refresh the two quick-rejection bytes.  NOTE(review): best_len is presumably
 * updated on embedded line 173, which is not visible; confirm. */
176 scan_end1 = scan[best_len - 1];
177 scan_end = scan[best_len];
/* Keep going while the chain budget lasts and the next chain entry does not
 * start before @min_start_pos.  Index 0 also stops the walk whenever
 * @min_start_pos is at least 1. */
179 } while (--chain_len != 0 && (cur_match = prev_tab[cur_match]) >= min_start_pos);
180 *match_start_ret = match_start;
/* Never report more than the remaining input or max_match. */
181 return min(min(best_len, bytes_remaining), params->max_match);
187 * Determines the sequence of matches and literals that a block of data will be compressed to.
190 * @window: The data that is to be compressed.
191 * @window_size: The length of @window, in bytes.
192 * @record_match: Consumer for matches.
193 * @record_literal: Consumer for literals.
194 * @record_ctx: Context passed to @record_match and @record_literal.
195 * @params: Structure that contains parameters that affect how the
196 * analysis proceeds (mainly how good the matches must be).
198 * @prev_tab: Temporary space containing at least @window_size elements.
/* Scan @window one position at a time, starting at index 0 and ending when
 * cur_input_pos reaches @window_size.  At each position the string there is
 * entered into the hash table (when at least params->min_match bytes remain).
 * If the bucket head is in range and the previous match is shorter than
 * params->max_lazy_match, longest_match() is asked for a match among earlier
 * positions.  The match found at the previous position is reported through
 * @record_match only when the match at the current position is not longer;
 * otherwise the previous byte is reported through @record_literal.
 *
 * NOTE(review): this is a line-numbered listing with gaps.  Among others,
 * embedded lines 200, 205, 210, 213, 221-222, 231, 259, 261, 270, 276,
 * 285-288, 291-294, 304 and 311 are absent.  As a result the return-type
 * line, the declarations of record_ctx, hash and prev_start, the start of the
 * main loop and several call arguments are not visible here.  Confirm
 * anything that depends on them against the complete file. */
201 lz_analyze_block(const u8 window[restrict],
202 input_idx_t window_size,
203 lz_record_match_t record_match,
204 lz_record_literal_t record_literal,
206 const struct lz_params *params,
207 input_idx_t prev_tab[restrict])
209 unsigned cur_input_pos = 0;
211 unsigned hash_head = 0;
/* One less than min_match is the value used throughout for no usable match:
 * a match is only reported when prev_len >= params->min_match. */
212 unsigned prev_len = params->min_match - 1;
214 unsigned match_len = params->min_match - 1;
215 unsigned match_start = 0;
/* True while the byte at the previous position is still waiting to be
 * reported as either a literal or the start of a match. */
216 bool match_available = false;
/* Bucket heads, one entry per hash value, held in a local array. */
217 input_idx_t hash_tab[HASH_SIZE];
/* A start position of 0 is never accepted as a match. */
218 unsigned min_start_pos = 1;
/* ZERO_ARRAY is defined outside this listing; presumably it clears every
 * bucket head to 0, the end-of-chain index.  Confirm. */
220 ZERO_ARRAY(hash_tab);
223 /* If there are at least 3 characters remaining in the input,
224 * insert the 3-character string beginning at
225 * window[cur_input_pos] into the hash table.
227 * hash_head is set to the index of the previous string in the
228 * hash bucket, or 0 if there is no such string */
229 if (window_size - cur_input_pos >= params->min_match) {
230 hash = insert_string(hash_tab, prev_tab,
232 cur_input_pos, hash);
/* insert_string saved the old bucket head in prev_tab, so this reads back the
 * most recent earlier string with the same hash. */
233 hash_head = prev_tab[cur_input_pos];
239 /* Find the longest match, discarding those <= prev_len. */
240 prev_len = match_len;
241 prev_start = match_start;
242 match_len = params->min_match - 1;
/* Once the position is more than max_offset into the window, candidates that
 * start further back than max_offset are ruled out. */
244 if (cur_input_pos > params->max_offset)
245 min_start_pos = cur_input_pos - params->max_offset;
249 if (hash_head >= min_start_pos &&
250 prev_len < params->max_lazy_match)
252 /* To simplify the code, we prevent matches with the
253 * string of window index 0 (in particular we have to
254 * avoid a match of the string with itself at the start
255 * of the input file). */
256 match_len = longest_match(window,
257 window_size - cur_input_pos,
258 cur_input_pos, prev_tab,
260 &match_start, params,
/* A match of exactly min_match bytes is dropped when it lies more than
 * too_far positions back. */
263 if (match_len == params->min_match &&
264 cur_input_pos - match_start > params->too_far)
265 match_len = params->min_match - 1;
/* NOTE(review): the comment that begins on the next line has no visible
 * closing line (embedded line 270 is absent from this listing). */
268 /* If there was a match at the previous step and the current
269 * match is not better, output the previous match:
271 if (prev_len >= params->min_match && match_len <= prev_len) {
274 (*record_match)(prev_len,
275 cur_input_pos - 1 - prev_start,
278 /* Insert in hash table all strings up to the end of the
279 * match. strstart-1 and strstart are already inserted.
280 * If there is not enough lookahead, the last two
281 * strings are not inserted in the hash table. */
283 /* Do not insert strings in hash table beyond this. */
284 unsigned max_insert = window_size - params->min_match;
/* Step through the positions covered by the reported match, entering each one
 * into the hash table unless it lies past max_insert.  NOTE(review): the
 * lines that start this loop (embedded 285-288) are not visible. */
289 if (++cur_input_pos <= max_insert) {
290 hash = insert_string(hash_tab, prev_tab,
295 } while (--prev_len != 0);
/* The position after a reported match starts with nothing pending. */
296 match_available = false;
297 match_len = params->min_match - 1;
298 } else if (match_available) {
299 /* If there was no match at the previous position,
300 * output a single literal. If there was a match but the
301 * current match is longer, truncate the previous match
302 * to a single literal. */
303 (*record_literal)(window[cur_input_pos - 1], record_ctx);
305 /* There is no previous match to compare with, wait for
306 * the next step to decide. */
307 match_available = true;
309 } while (++cur_input_pos < window_size);
/* Report the last byte of the window as a literal.  NOTE(review): embedded
 * line 311 is absent; it presumably holds a condition guarding this call.
 * Confirm against the complete file. */
312 (*record_literal)(window[cur_input_pos - 1], record_ctx);