4 * This file provides the code to analyze a buffer of uncompressed data for
5 * matches, as per the LZ77 algorithm. It uses a hash table to accelerate the
6 * process. This is based on code from zlib (v. 1.2.5).
10 * Copyright (C) 2012 Eric Biggers
11 * Copyright (C) 1995-2010 Jean-loup Gailly and Mark Adler
13 * This file is part of wimlib, a library for working with WIM files.
15 * wimlib is free software; you can redistribute it and/or modify it under the
16 * terms of the GNU General Public License as published by the Free
17 * Software Foundation; either version 3 of the License, or (at your option)
20 * wimlib is distributed in the hope that it will be useful, but WITHOUT ANY
21 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
22 * A PARTICULAR PURPOSE. See the GNU General Public License for more
25 * You should have received a copy of the GNU General Public License
26 * along with wimlib; if not, see http://www.gnu.org/licenses/.
32 #define LZ_MIN_MATCH 3
35 #define HASH_SIZE (1 << HASH_BITS)
36 #define HASH_MASK (HASH_SIZE - 1)
40 #elif LZ_MIN_MATCH == 3
43 #error "Invalid LZ_MIN_MATCH"
46 /* Hash function, based on code from zlib. This function will update and return
47 * the hash value @hash for the string ending on the additional input character
48 * @c. This function must be called for each consecutive character, because it
49 * uses a running hash value rather than computing it separately for each
52 * The AND operation guarantees that only 3 characters will affect the hash
53 * value, so every identical 3-character string will have the same hash value.
55 static inline unsigned update_hash(unsigned hash, u8 c)
57 return ((hash << HASH_SHIFT) ^ c) & HASH_MASK;
61 /* Insert a 3-character string at position @str_pos in @window and with hash
62 * code @hash into the hash table described by @hash_tab and @prev_tab. Based
65 * The hash table uses chains (linked lists) for the hash buckets, but there are
66 * no real pointers involved. Indexing `hash_tab' by hash value gives the index
67 * within the window of the last string in the hash bucket. To find the index
68 * of the previous string in the hash chain, the `prev_tab' array is indexed by
69 * the string index. `prev_tab' can be indexed repeatedly by the string index
70 * to walk through the hash chain, until the special index `0' is reached,
71 * indicating the end of the hash chain.
73 static inline unsigned insert_string(u16 hash_tab[], u16 prev_tab[],
74 const u8 window[], unsigned str_pos,
77 hash = update_hash(hash, window[str_pos + LZ_MIN_MATCH - 1]);
78 prev_tab[str_pos] = hash_tab[hash];
79 hash_tab[hash] = str_pos;
85 * Returns the longest match for a given input position.
87 * @window: The window of uncompressed data.
88 * @bytes_remaining: The number of bytes remaining in the window.
89 * @strstart: The index of the start of the string in the window that
90 * we are trying to find a match for.
91 * @prev_tab: The array of prev pointers for the hash table.
92 * @cur_match: The index of the head of the hash chain for matches
93 * having the hash value of the string beginning
95 * @prev_len: The length of the match that was found for the string
96 * beginning at (@strstart - 1).
97 * @match_start_ret: A location into which the index of the start of the
98 * match will be returned.
99 * @params: Parameters that affect how long the search will proceed
100 * before going with the best that has been found
103 * Returns the length of the match that was found.
105 static unsigned longest_match(const u8 window[], unsigned bytes_remaining,
106 unsigned strstart, const u16 prev_tab[],
107 unsigned cur_match, unsigned prev_len,
108 unsigned *match_start_ret,
109 const struct lz_params *params)
111 unsigned chain_len = params->max_chain_len;
113 const u8 *scan = window + strstart;
116 unsigned best_len = prev_len;
117 unsigned match_start = cur_match;
119 unsigned nice_match = min(params->nice_match, bytes_remaining);
121 const u8 *strend = scan + min(params->max_match, bytes_remaining);
123 u8 scan_end1 = scan[best_len - 1];
124 u8 scan_end = scan[best_len];
127 /* Do not waste too much time if we already have a good match: */
128 if (best_len >= params->good_match)
132 match = &window[cur_match];
134 /* Skip to next match if the match length cannot increase or if
135 * the match length is less than 2. Note that the checks below
136 * for insufficient lookahead only occur occasionally for
137 * performance reasons. Therefore uninitialized memory will be
138 * accessed, and conditional jumps will be made that depend on
139 * those values. However the length of the match is limited to
140 * the lookahead, so the output of deflate is not affected by
141 * the uninitialized values.
144 if (match[best_len] != scan_end
145 || match[best_len - 1] != scan_end1
147 || *++match != scan[1])
153 } while (scan < strend && *++match == *++scan);
158 *++match == *++scan && *++match == *++scan &&
159 *++match == *++scan && *++match == *++scan &&
160 *++match == *++scan && *++match == *++scan &&
161 *++match == *++scan && *++match == *++scan &&
164 len = match - &window[cur_match];
166 scan = &window[strstart];
168 if (len > best_len) {
169 match_start = cur_match;
171 if (len >= nice_match)
173 scan_end1 = scan[best_len - 1];
174 scan_end = scan[best_len];
176 } while (--chain_len != 0 && (cur_match = prev_tab[cur_match]) != 0);
177 *match_start_ret = match_start;
178 return min(min(best_len, bytes_remaining), params->max_match);
184 * Determines the sequence of matches and literals that a block of data will be
187 * @uncompressed_data: The data that is to be compressed.
188 * @uncompressed_len: The length of @uncompressed_data, in bytes.
189 * @match_tab: An array for the intermediate representation of matches.
190 * @record_match: A function that will be called to produce the
191 * intermediate representation of a match, given
192 * the offset and length. This function should also
193 * update the appropriate symbol frequency counts
194 * so that any needed Huffman codes can be made
196 * @record_literal: A function that will be called to produce the
197 * intermediate representation of a literal, given
198 * the character of the literal. This function
199 * should also update the appropriate symbol
200 * frequency counts so that any needed Huffman
201 * codes can be made later.
202 * @record_match_arg_1:
203 * @record_match_arg_2: Extra arguments to be passed to @record_match.
204 * @record_literal_arg: Extra arguments to be passed to @record_literal.
205 * @params: Structure that contains parameters that affect how the
206 * analysis proceeds (mainly how good the matches
209 * Returns the total number of matches and literal bytes that were found; this
210 * is the number of slots in @match_tab that have been filled with the
211 * intermediate representation of a match or literal byte.
213 unsigned lz_analyze_block(const u8 uncompressed_data[],
214 unsigned uncompressed_len,
216 lz_record_match_t record_match,
217 lz_record_literal_t record_literal,
218 void *record_match_arg1,
219 void *record_match_arg2,
220 void *record_literal_arg,
221 const struct lz_params *params)
223 unsigned cur_match_pos = 0;
224 unsigned cur_input_pos = 0;
226 unsigned hash_head = 0;
227 unsigned prev_len = params->min_match - 1;
229 unsigned match_len = params->min_match - 1;
230 unsigned match_start = 0;
231 bool match_available = false;
232 u16 hash_tab[HASH_SIZE];
234 u16 prev_tab[uncompressed_len];
236 ZERO_ARRAY(hash_tab);
237 ZERO_ARRAY(prev_tab);
240 /* If there are at least 3 characters remaining in the input,
241 * insert the 3-character string beginning at
242 * uncompressed_data[cur_input_pos] into the hash table.
244 * hash_head is set to the index of the previous string in the
245 * hash bucket, or 0 if there is no such string */
246 if (uncompressed_len - cur_input_pos >= params->min_match) {
247 hash = insert_string(hash_tab, prev_tab,
249 cur_input_pos, hash);
250 hash_head = prev_tab[cur_input_pos];
256 /* Find the longest match, discarding those <= prev_len. */
257 prev_len = match_len;
258 prev_start = match_start;
259 match_len = params->min_match - 1;
261 if (hash_head != 0 && prev_len < params->max_lazy_match) {
262 /* To simplify the code, we prevent matches with the
263 * string of window index 0 (in particular we have to
264 * avoid a match of the string with itself at the start
265 * of the input file). */
266 match_len = longest_match(uncompressed_data,
267 uncompressed_len - cur_input_pos,
268 cur_input_pos, prev_tab,
270 &match_start, params);
272 if (match_len == params->min_match &&
273 cur_input_pos - match_start > params->too_far)
274 match_len = params->min_match - 1;
277 /* If there was a match at the previous step and the current
278 * match is not better, output the previous match:
280 if (prev_len >= params->min_match && match_len <= prev_len) {
282 /* Do not insert strings in hash table beyond this. */
283 unsigned max_insert = uncompressed_len - params->min_match;
285 /*DEBUG("Recording match (pos = %u, offset = %u, len = %u)\n",*/
286 /*cur_input_pos - 1, */
287 /*cur_input_pos - 1 - prev_start,*/
290 match = (*record_match)(cur_input_pos - 1 - prev_start,
295 match_tab[cur_match_pos++] = match;
297 /* Insert in hash table all strings up to the end of the match.
298 * strstart-1 and strstart are already inserted. If there is not
299 * enough lookahead, the last two strings are not inserted in
302 #if LZ_MIN_MATCH == 2
309 if (++cur_input_pos <= max_insert) {
310 hash = insert_string(hash_tab, prev_tab,
315 } while (--prev_len != 0);
317 match_available = false;
318 match_len = params->min_match - 1;
319 } else if (match_available) {
320 /* If there was no match at the previous position, output a
321 * single literal. If there was a match but the current match
322 * is longer, truncate the previous match to a single literal.
325 /*DEBUG("Recording litrl (pos = %u, value = %u)\n",*/
326 /*cur_input_pos - 1, */
327 /*uncompressed_data[cur_input_pos - 1]);*/
329 match = (*record_literal)(
330 uncompressed_data[cur_input_pos - 1],
332 match_tab[cur_match_pos++] = match;
334 /* There is no previous match to compare with, wait for
335 * the next step to decide. */
336 match_available = true;
338 } while (++cur_input_pos < uncompressed_len);
340 if (match_available) {
341 match = (*record_literal)(uncompressed_data[cur_input_pos - 1],
343 match_tab[cur_match_pos++] = match;
345 return cur_match_pos;