4 * This file provides the code to analyze a buffer of uncompressed data for
5 * matches, as per the LZ77 algorithm. It uses a hash table to accelerate the
6 * process. This is based on code from zlib (v. 1.2.5).
8 * Copyright (C) 2012 Eric Biggers
9 * Copyright (C) 1995-2010 Jean-loup Gailly and Mark Adler
11 * wimlib - Library for working with WIM files
13 * This library is free software; you can redistribute it and/or modify it under
14 * the terms of the GNU Lesser General Public License as published by the Free
15 * Software Foundation; either version 2.1 of the License, or (at your option) any
18 * This library is distributed in the hope that it will be useful, but WITHOUT ANY
19 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
20 * PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.
22 * You should have received a copy of the GNU Lesser General Public License along
23 * with this library; if not, write to the Free Software Foundation, Inc., 59
24 * Temple Place, Suite 330, Boston, MA 02111-1307 USA
31 #define HASH_SIZE (1 << HASH_BITS)
32 #define HASH_MASK (HASH_SIZE - 1)
33 #define HASH_SHIFT ((HASH_BITS + 2) / 3)
35 /* Hash function, based on code from zlib. This function will update and return
36 * the hash value @hash for the string ending on the additional input character
37 * @c. This function must be called for each consecutive character, because it
38 * uses a running hash value rather than computing it separately for each
41 * The AND operation guarantees that only 3 characters will affect the hash
42 * value, so every identical 3-character string will have the same hash value.
44 static inline uint update_hash(uint hash, u8 c)
46 return ((hash << HASH_SHIFT) ^ c) & HASH_MASK;
50 /* Insert a 3-character string at position @str_pos in @window and with hash
51 * code @hash into the hash table described by @hash_tab and @prev_tab. Based
54 * The hash table uses chains (linked lists) for the hash buckets, but there are
55 * no real pointers involved. Indexing `hash_tab' by hash value gives the index
56 * within the window of the last string in the hash bucket. To find the index
57 * of the previous string in the hash chain, the `prev_tab' array is indexed by
58 * the string index. `prev_tab' can be indexed repeatedly by the string index
59 * to walk through the hash chain, until the special index `0' is reached,
60 * indicating the end of the hash chain.
62 static inline uint insert_string(u16 hash_tab[], u16 prev_tab[],
63 const u8 window[], uint str_pos,
66 hash = update_hash(hash, window[str_pos + 2]);
67 prev_tab[str_pos] = hash_tab[hash];
68 hash_tab[hash] = str_pos;
74 * Returns the longest match for a given input position.
76 * @window: The window of uncompressed data.
77 * @bytes_remaining: The number of bytes remaining in the window.
78 * @strstart: The index of the start of the string in the window that
79 * we are trying to find a match for.
80 * @prev_tab: The array of prev pointers for the hash table.
81 * @cur_match: The index of the head of the hash chain for matches
82 * having the hash value of the string beginning
84 * @prev_len: The length of the match that was found for the string
85 * beginning at (@strstart - 1).
86 * @match_start_ret: A location into which the index of the start of the
87 * match will be returned.
88 * @params: Parameters that affect how long the search will proceed
89 * before going with the best that has been found
92 * Returns the length of the match that was found.
94 static uint longest_match(const u8 window[], uint bytes_remaining,
95 uint strstart, const u16 prev_tab[],
96 uint cur_match, uint prev_len,
97 uint *match_start_ret,
98 const struct lz_params *params)
100 uint chain_len = params->max_chain_len;
102 const u8 *scan = window + strstart;
105 uint best_len = prev_len;
106 uint match_start = cur_match;
108 uint nice_match = min(params->nice_match, bytes_remaining);
110 const u8 *strend = scan + params->max_match;
112 u8 scan_end1 = scan[best_len - 1];
113 u8 scan_end = scan[best_len];
116 /* Do not waste too much time if we already have a good match: */
117 if (best_len >= params->good_match)
121 match = &window[cur_match];
124 /* Skip to next match if the match length cannot increase
125 * or if the match length is less than 2. Note that the checks below
126 * for insufficient lookahead only occur occasionally for performance
127 * reasons. Therefore uninitialized memory will be accessed, and
128 * conditional jumps will be made that depend on those values.
129 * However the length of the match is limited to the lookahead, so
130 * the output of deflate is not affected by the uninitialized values.
133 if (match[best_len] != scan_end ||
134 match[best_len-1] != scan_end1 ||
136 *++match != scan[1]) continue;
138 /* The check at best_len-1 can be removed because it will be made
139 * again later. (This heuristic is not always a win.)
140 * It is not necessary to compare scan[2] and match[2] since they
141 * are always equal when the other bytes match, given that
142 * the hash keys are equal and that HASH_BITS >= 8.
146 wimlib_assert(*scan == *match);
148 /* We check for insufficient lookahead only every 8th comparison;
149 * the 256th check will be made at strstart+258. */
151 } while (*++scan == *++match && *++scan == *++match &&
152 *++scan == *++match && *++scan == *++match &&
153 *++scan == *++match && *++scan == *++match &&
154 *++scan == *++match && *++scan == *++match &&
157 len = params->max_match - (int)(strend - scan);
158 scan = strend - params->max_match;
160 if (len > best_len) {
161 match_start = cur_match;
163 if (len >= nice_match)
165 scan_end1 = scan[best_len - 1];
166 scan_end = scan[best_len];
168 cur_match = prev_tab[cur_match];
169 } while (--chain_len != 0 && cur_match != 0);
170 *match_start_ret = match_start;
171 return min(min(best_len, bytes_remaining), params->max_match);
177 * Determines the sequence of matches and literals that a block of data will be
180 * @uncompressed_data: The data that is to be compressed.
181 * @uncompressed_len: The length of @uncompressed_data, in bytes.
182 * @match_tab: An array for the intermediate representation of matches.
183 * @record_match: A function that will be called to produce the
184 * intermediate representation of a match, given
185 * the offset and length. This function should also
186 * update the appropriate symbol frequency counts
187 * so that any needed Huffman codes can be made
189 * @record_literal: A function that will be called to produce the
190 * intermediate representation of a literal, given
191 * the character of the literal. This function
192 * should also update the appropriate symbol
193 * frequency counts so that any needed Huffman
194 * codes can be made later.
195 * @record_match_arg_1:
196 * @record_match_arg_2: Extra arguments to be passed to @record_match.
197 * @record_literal_arg: Extra arguments to be passed to @record_literal.
198 * @params: Structure that contains parameters that affect how the
199 * analysis proceeds (mainly how good the matches
202 * Returns the total number of matches and literal bytes that were found; this
203 * is the number of slots in @match_tab that have been filled with the
204 * intermediate representation of a match or literal byte.
206 uint lz_analyze_block(const u8 uncompressed_data[], uint uncompressed_len,
207 u32 match_tab[], lz_record_match_t record_match,
208 lz_record_literal_t record_literal, void *record_match_arg1,
209 void *record_match_arg2, void *record_literal_arg,
210 const struct lz_params *params)
212 uint cur_match_pos = 0;
213 uint cur_input_pos = 0;
216 uint prev_len = params->min_match - 1;
218 uint match_len = params->min_match - 1;
219 uint match_start = 0;
220 bool match_available = false;
221 u16 hash_tab[HASH_SIZE];
223 u16 prev_tab[uncompressed_len];
225 ZERO_ARRAY(hash_tab);
226 ZERO_ARRAY(prev_tab);
229 /* If there are at least 3 characters remaining in the input,
230 * insert the 3-character string beginning at
231 * uncompressed_data[cur_input_pos] into the hash table.
233 * hash_head is set to the index of the previous string in the
234 * hash bucket, or 0 if there is no such string */
235 if (uncompressed_len - cur_input_pos >= params->min_match) {
236 hash = insert_string(hash_tab, prev_tab,
238 cur_input_pos, hash);
239 hash_head = prev_tab[cur_input_pos];
245 /* Find the longest match, discarding those <= prev_len. */
246 prev_len = match_len;
247 prev_start = match_start;
248 match_len = params->min_match - 1;
250 if (hash_head != 0 && prev_len < params->min_match) {
251 /* To simplify the code, we prevent matches with the
252 * string of window index 0 (in particular we have to
253 * avoid a match of the string with itself at the start
254 * of the input file). */
255 match_len = longest_match(uncompressed_data,
256 uncompressed_len - cur_input_pos,
257 cur_input_pos, prev_tab,
259 &match_start, params);
261 if (match_len == params->min_match &&
262 cur_input_pos - match_start > params->too_far)
264 match_len = params->min_match - 1;
268 /* If there was a match at the previous step and the current
269 * match is not better, output the previous match:
271 if (prev_len >= params->min_match && match_len <= prev_len) {
273 /* Do not insert strings in hash table beyond this. */
274 uint max_insert = uncompressed_len - params->min_match;
276 /*DEBUG("Recording match (pos = %u, offset = %u, len = %u)\n",*/
277 /*cur_input_pos - 1, */
278 /*cur_input_pos - 1 - prev_start,*/
281 match = (*record_match)(cur_input_pos - 1 - prev_start,
286 match_tab[cur_match_pos++] = match;
288 /* Insert in hash table all strings up to the end of the match.
289 * strstart-1 and strstart are already inserted. If there is not
290 * enough lookahead, the last two strings are not inserted in
296 if (++cur_input_pos <= max_insert) {
297 hash = insert_string(hash_tab, prev_tab,
302 } while (--prev_len != 0);
304 match_available = false;
305 match_len = params->min_match - 1;
306 } else if (match_available) {
307 /* If there was no match at the previous position, output a
308 * single literal. If there was a match but the current match
309 * is longer, truncate the previous match to a single literal.
312 /*DEBUG("Recording litrl (pos = %u, value = %u)\n",*/
313 /*cur_input_pos - 1, */
314 /*uncompressed_data[cur_input_pos - 1]);*/
316 match = (*record_literal)(
317 uncompressed_data[cur_input_pos - 1],
319 match_tab[cur_match_pos++] = match;
321 /* There is no previous match to compare with, wait for
322 * the next step to decide. */
323 match_available = true;
325 } while (++cur_input_pos < uncompressed_len);
327 if (match_available) {
328 match = (*record_literal)(uncompressed_data[cur_input_pos - 1],
330 match_tab[cur_match_pos++] = match;
332 return cur_match_pos;