* This file is part of wimlib, a library for working with WIM files.
*
* wimlib is free software; you can redistribute it and/or modify it under the
- * terms of the GNU Lesser General Public License as published by the Free
- * Software Foundation; either version 2.1 of the License, or (at your option)
+ * terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 3 of the License, or (at your option)
* any later version.
*
* wimlib is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
- * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+ * A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
- * You should have received a copy of the GNU Lesser General Public License
+ * You should have received a copy of the GNU General Public License
* along with wimlib; if not, see http://www.gnu.org/licenses/.
*/
#include "comp.h"
#include <string.h>
+#define LZ_MIN_MATCH 3
+
#define HASH_BITS 15
#define HASH_SIZE (1 << HASH_BITS)
#define HASH_MASK (HASH_SIZE - 1)
-#define HASH_SHIFT ((HASH_BITS + 2) / 3)
+
+#if LZ_MIN_MATCH == 2
+# define HASH_SHIFT 8
+#elif LZ_MIN_MATCH == 3
+# define HASH_SHIFT 5
+#else
+#error "Invalid LZ_MIN_MATCH"
+#endif
/* Hash function, based on code from zlib. This function will update and return
* the hash value @hash for the string ending on the additional input character
* to walk through the hash chain, until the special index `0' is reached,
* indicating the end of the hash chain.
*/
-static inline uint insert_string(u16 hash_tab[], u16 prev_tab[],
- const u8 window[], uint str_pos,
- uint hash)
+static inline uint insert_string(u16 hash_tab[], u16 prev_tab[],
+ const u8 window[], uint str_pos, uint hash)
{
- hash = update_hash(hash, window[str_pos + 2]);
+ hash = update_hash(hash, window[str_pos + LZ_MIN_MATCH - 1]);
prev_tab[str_pos] = hash_tab[hash];
hash_tab[hash] = str_pos;
return hash;
* @strstart: The index of the start of the string in the window that
* we are trying to find a match for.
* @prev_tab: The array of prev pointers for the hash table.
- * @cur_match: The index of the head of the hash chain for matches
- * having the hash value of the string beginning
+ * @cur_match: The index of the head of the hash chain for matches
+ * having the hash value of the string beginning
* at index @strstart.
- * @prev_len: The length of the match that was found for the string
+ * @prev_len: The length of the match that was found for the string
* beginning at (@strstart - 1).
* @match_start_ret: A location into which the index of the start of the
* match will be returned.
* Returns the length of the match that was found.
*/
static uint longest_match(const u8 window[], uint bytes_remaining,
- uint strstart, const u16 prev_tab[],
- uint cur_match, uint prev_len,
- uint *match_start_ret,
+ uint strstart, const u16 prev_tab[],
+ uint cur_match, uint prev_len,
+ uint *match_start_ret,
const struct lz_params *params)
{
uint chain_len = params->max_chain_len;
uint nice_match = min(params->nice_match, bytes_remaining);
- const u8 *strend = scan + params->max_match;
+ const u8 *strend = scan + min(params->max_match, bytes_remaining);
u8 scan_end1 = scan[best_len - 1];
u8 scan_end = scan[best_len];
do {
match = &window[cur_match];
-
- /* Skip to next match if the match length cannot increase
- * or if the match length is less than 2. Note that the checks below
- * for insufficient lookahead only occur occasionally for performance
- * reasons. Therefore uninitialized memory will be accessed, and
- * conditional jumps will be made that depend on those values.
- * However the length of the match is limited to the lookahead, so
- * the output of deflate is not affected by the uninitialized values.
+ /* Skip to next match if the match length cannot increase or if
+ * the match length is less than 2. Note that the checks below
+ * for insufficient lookahead only occur occasionally for
+ * performance reasons. Therefore uninitialized memory will be
+ * accessed, and conditional jumps will be made that depend on
+ * those values. However the length of the match is limited to
+ * the lookahead, so the output of deflate is not affected by
+ * the uninitialized values.
*/
- if (match[best_len] != scan_end ||
- match[best_len-1] != scan_end1 ||
- *match != *scan ||
- *++match != scan[1]) continue;
+ if (match[best_len] != scan_end
+ || match[best_len - 1] != scan_end1
+ || *match != *scan
+ || *++match != scan[1])
+ continue;
+ scan++;
- /* The check at best_len-1 can be removed because it will be made
- * again later. (This heuristic is not always a win.)
- * It is not necessary to compare scan[2] and match[2] since they
- * are always equal when the other bytes match, given that
- * the hash keys are equal and that HASH_BITS >= 8.
- */
- scan += 2, match++;
-
- wimlib_assert(*scan == *match);
+ #if 0
+ do {
+ } while (scan < strend && *++match == *++scan);
+ #else
- /* We check for insufficient lookahead only every 8th comparison;
- * the 256th check will be made at strstart+258. */
do {
- } while (*++scan == *++match && *++scan == *++match &&
- *++scan == *++match && *++scan == *++match &&
- *++scan == *++match && *++scan == *++match &&
- *++scan == *++match && *++scan == *++match &&
- scan < strend);
+ } while (
+ *++match == *++scan && *++match == *++scan &&
+ *++match == *++scan && *++match == *++scan &&
+ *++match == *++scan && *++match == *++scan &&
+ *++match == *++scan && *++match == *++scan &&
+ scan < strend);
+ #endif
+ len = match - &window[cur_match];
- len = params->max_match - (int)(strend - scan);
- scan = strend - params->max_match;
+ scan = &window[strstart];
if (len > best_len) {
match_start = cur_match;
best_len = len;
- if (len >= nice_match)
+ if (len >= nice_match)
break;
scan_end1 = scan[best_len - 1];
scan_end = scan[best_len];
}
- cur_match = prev_tab[cur_match];
- } while (--chain_len != 0 && cur_match != 0);
+ } while (--chain_len != 0 && (cur_match = prev_tab[cur_match]) != 0);
*match_start_ret = match_start;
return min(min(best_len, bytes_remaining), params->max_match);
}
-/*
+/*
* Determines the sequence of matches and literals that a block of data will be
* compressed to.
*
* @record_literal: A function that will be called to produce the
* intermediate representation of a literal, given
* the character of the literal. This function
- * should also update the appropriate symbol
- * frequency counts so that any needed Huffman
+ * should also update the appropriate symbol
+ * frequency counts so that any needed Huffman
* codes can be made later.
* @record_match_arg_1:
* @record_match_arg_2: Extra arguments to be passed to @record_match.
* intermediate representation of a match or literal byte.
*/
uint lz_analyze_block(const u8 uncompressed_data[], uint uncompressed_len,
- u32 match_tab[], lz_record_match_t record_match,
- lz_record_literal_t record_literal, void *record_match_arg1,
- void *record_match_arg2, void *record_literal_arg,
- const struct lz_params *params)
+ u32 match_tab[], lz_record_match_t record_match,
+ lz_record_literal_t record_literal, void *record_match_arg1,
+ void *record_match_arg2, void *record_literal_arg,
+ const struct lz_params *params)
{
uint cur_match_pos = 0;
uint cur_input_pos = 0;
* hash_head is set to the index of the previous string in the
* hash bucket, or 0 if there is no such string */
if (uncompressed_len - cur_input_pos >= params->min_match) {
- hash = insert_string(hash_tab, prev_tab,
- uncompressed_data,
- cur_input_pos, hash);
+ hash = insert_string(hash_tab, prev_tab,
+ uncompressed_data,
+ cur_input_pos, hash);
hash_head = prev_tab[cur_input_pos];
} else {
hash_head = 0;
prev_start = match_start;
match_len = params->min_match - 1;
- if (hash_head != 0 && prev_len < params->min_match) {
+ if (hash_head != 0 && prev_len < params->max_lazy_match) {
/* To simplify the code, we prevent matches with the
* string of window index 0 (in particular we have to
* avoid a match of the string with itself at the start
* of the input file). */
- match_len = longest_match(uncompressed_data,
- uncompressed_len - cur_input_pos,
- cur_input_pos, prev_tab,
- hash_head, prev_len,
- &match_start, params);
-
- if (match_len == params->min_match &&
- cur_input_pos - match_start > params->too_far)
- {
+ match_len = longest_match(uncompressed_data,
+ uncompressed_len - cur_input_pos,
+ cur_input_pos, prev_tab,
+ hash_head, prev_len,
+ &match_start, params);
+
+ if (match_len == params->min_match &&
+ cur_input_pos - match_start > params->too_far)
match_len = params->min_match - 1;
- }
}
/* If there was a match at the previous step and the current
/*cur_input_pos - 1 - prev_start,*/
/*prev_len);*/
- match = (*record_match)(cur_input_pos - 1 - prev_start,
- prev_len,
- record_match_arg1,
- record_match_arg2);
+ match = (*record_match)(cur_input_pos - 1 - prev_start,
+ prev_len,
+ record_match_arg1,
+ record_match_arg2);
match_tab[cur_match_pos++] = match;
* enough lookahead, the last two strings are not inserted in
* the hash table.
*/
- prev_len -= 2;
-
- do {
- if (++cur_input_pos <= max_insert) {
- hash = insert_string(hash_tab, prev_tab,
- uncompressed_data,
- cur_input_pos,
- hash);
- }
- } while (--prev_len != 0);
-
+#if LZ_MIN_MATCH == 2
+ if (prev_len >= 3)
+#endif
+ {
+ prev_len -= 2;
+
+ do {
+ if (++cur_input_pos <= max_insert) {
+ hash = insert_string(hash_tab, prev_tab,
+ uncompressed_data,
+ cur_input_pos,
+ hash);
+ }
+ } while (--prev_len != 0);
+ }
match_available = false;
match_len = params->min_match - 1;
} else if (match_available) {