X-Git-Url: https://wimlib.net/git/?a=blobdiff_plain;f=src%2Flz77.c;h=b5495da74d1599cd59e12cc8ef65c495550eba55;hb=4f433755e8f9ef79dbb4699430d047f74e338e82;hp=924663d0cb634b29c1a60d61f202dc16dc1e6fbf;hpb=10a87017a0a82d34ed3981e1f5e586b5b8613e3f;p=wimlib diff --git a/src/lz77.c b/src/lz77.c index 924663d0..b5495da7 100644 --- a/src/lz77.c +++ b/src/lz77.c @@ -1,5 +1,5 @@ /* - * lz.c + * lz77.c * * This file provides the code to analyze a buffer of uncompressed data for * matches, as per the LZ77 algorithm. It uses a hash table to accelerate the @@ -7,7 +7,7 @@ */ /* - * Copyright (C) 2012, 2013 Biggers + * Copyright (C) 2012, 2013 Eric Biggers * Copyright (C) 1995-2010 Jean-loup Gailly and Mark Adler * * This file is part of wimlib, a library for working with WIM files. @@ -26,22 +26,19 @@ * along with wimlib; if not, see http://www.gnu.org/licenses/. */ -#include "compress.h" -#include +#ifdef HAVE_CONFIG_H +# include +#endif -#define LZ_MIN_MATCH 3 +#include "wimlib/compress_common.h" +#include "wimlib/util.h" + +#include #define HASH_BITS 15 #define HASH_SIZE (1 << HASH_BITS) #define HASH_MASK (HASH_SIZE - 1) - -#if LZ_MIN_MATCH == 2 -# define HASH_SHIFT 8 -#elif LZ_MIN_MATCH == 3 -# define HASH_SHIFT 5 -#else -#error "Invalid LZ_MIN_MATCH" -#endif +#define HASH_SHIFT 5 /* Hash function, based on code from zlib. This function will update and return * the hash value @hash for the string ending on the additional input character @@ -52,7 +49,8 @@ * The AND operation guarantees that only 3 characters will affect the hash * value, so every identical 3-character string will have the same hash value. */ -static inline unsigned update_hash(unsigned hash, u8 c) +static inline unsigned +update_hash(unsigned hash, u8 c) { return ((hash << HASH_SHIFT) ^ c) & HASH_MASK; } @@ -70,11 +68,12 @@ static inline unsigned update_hash(unsigned hash, u8 c) * to walk through the hash chain, until the special index `0' is reached, * indicating the end of the hash chain. */ -static inline unsigned insert_string(u16 hash_tab[], u16 prev_tab[], - const u8 window[], unsigned str_pos, - unsigned hash) +static inline unsigned +insert_string(input_idx_t hash_tab[], input_idx_t prev_tab[], + const u8 window[], unsigned str_pos, + unsigned hash) { - hash = update_hash(hash, window[str_pos + LZ_MIN_MATCH - 1]); + hash = update_hash(hash, window[str_pos + 2]); prev_tab[str_pos] = hash_tab[hash]; hash_tab[hash] = str_pos; return hash; @@ -87,26 +86,31 @@ static inline unsigned insert_string(u16 hash_tab[], u16 prev_tab[], * @window: The window of uncompressed data. * @bytes_remaining: The number of bytes remaining in the window. * @strstart: The index of the start of the string in the window that - * we are trying to find a match for. + * we are trying to find a match for. * @prev_tab: The array of prev pointers for the hash table. * @cur_match: The index of the head of the hash chain for matches - * having the hash value of the string beginning - * at index @strstart. + * having the hash value of the string beginning + * at index @strstart. * @prev_len: The length of the match that was found for the string - * beginning at (@strstart - 1). + * beginning at (@strstart - 1). * @match_start_ret: A location into which the index of the start of the - * match will be returned. + * match will be returned. * @params: Parameters that affect how long the search will proceed - * before going with the best that has been found - * so far. + * before going with the best that has been found + * so far. + * @min_start_pos: If the chain reaches a match starting before this + * position (including the end-of-chain 0), the search will + * be terminated. * * Returns the length of the match that was found. */ -static unsigned longest_match(const u8 window[], unsigned bytes_remaining, - unsigned strstart, const u16 prev_tab[], - unsigned cur_match, unsigned prev_len, - unsigned *match_start_ret, - const struct lz_params *params) +static unsigned +longest_match(const u8 window[], unsigned bytes_remaining, + unsigned strstart, const input_idx_t prev_tab[], + unsigned cur_match, unsigned prev_len, + unsigned *match_start_ret, + const struct lz_params *params, + unsigned min_start_pos) { unsigned chain_len = params->max_chain_len; @@ -137,9 +141,8 @@ static unsigned longest_match(const u8 window[], unsigned bytes_remaining, * performance reasons. Therefore uninitialized memory will be * accessed, and conditional jumps will be made that depend on * those values. However the length of the match is limited to - * the lookahead, so the output of deflate is not affected by - * the uninitialized values. - */ + * the lookahead, so the output of lz_analyze_block() is not + * affected by the uninitialized values. */ if (match[best_len] != scan_end || match[best_len - 1] != scan_end1 @@ -173,7 +176,7 @@ static unsigned longest_match(const u8 window[], unsigned bytes_remaining, scan_end1 = scan[best_len - 1]; scan_end = scan[best_len]; } - } while (--chain_len != 0 && (cur_match = prev_tab[cur_match]) != 0); + } while (--chain_len != 0 && (cur_match = prev_tab[cur_match]) >= min_start_pos); *match_start_ret = match_start; return min(min(best_len, bytes_remaining), params->max_match); } @@ -184,43 +187,25 @@ static unsigned longest_match(const u8 window[], unsigned bytes_remaining, * Determines the sequence of matches and literals that a block of data will be * compressed to. * - * @uncompressed_data: The data that is to be compressed. - * @uncompressed_len: The length of @uncompressed_data, in bytes. - * @match_tab: An array for the intermediate representation of matches. - * @record_match: A function that will be called to produce the - * intermediate representation of a match, given - * the offset and length. This function should also - * update the appropriate symbol frequency counts - * so that any needed Huffman codes can be made - * later. - * @record_literal: A function that will be called to produce the - * intermediate representation of a literal, given - * the character of the literal. This function - * should also update the appropriate symbol - * frequency counts so that any needed Huffman - * codes can be made later. - * @record_match_arg_1: - * @record_match_arg_2: Extra arguments to be passed to @record_match. - * @record_literal_arg: Extra arguments to be passed to @record_literal. + * @window: The data that is to be compressed. + * @window_size: The length of @window, in bytes. + * @record_match: Consumer for matches. + * @record_literal: Consumer for literals. + * @record_ctx: Context passed to @record_match and @record_literal. * @params: Structure that contains parameters that affect how the - * analysis proceeds (mainly how good the matches - * have to be). - * - * Returns the total number of matches and literal bytes that were found; this - * is the number of slots in @match_tab that have been filled with the - * intermediate representation of a match or literal byte. + * analysis proceeds (mainly how good the matches + * have to be). + * @prev_tab: Temporary space containing least @window_size elements. */ -unsigned lz_analyze_block(const u8 uncompressed_data[], - unsigned uncompressed_len, - u32 match_tab[], - lz_record_match_t record_match, - lz_record_literal_t record_literal, - void *record_match_arg1, - void *record_match_arg2, - void *record_literal_arg, - const struct lz_params *params) +void +lz_analyze_block(const u8 window[restrict], + input_idx_t window_size, + lz_record_match_t record_match, + lz_record_literal_t record_literal, + void *record_ctx, + const struct lz_params *params, + input_idx_t prev_tab[restrict]) { - unsigned cur_match_pos = 0; unsigned cur_input_pos = 0; unsigned hash = 0; unsigned hash_head = 0; @@ -229,23 +214,21 @@ unsigned lz_analyze_block(const u8 uncompressed_data[], unsigned match_len = params->min_match - 1; unsigned match_start = 0; bool match_available = false; - u16 hash_tab[HASH_SIZE]; - u32 match; - u16 prev_tab[uncompressed_len]; + input_idx_t hash_tab[HASH_SIZE]; + unsigned min_start_pos = 1; ZERO_ARRAY(hash_tab); - ZERO_ARRAY(prev_tab); do { /* If there are at least 3 characters remaining in the input, * insert the 3-character string beginning at - * uncompressed_data[cur_input_pos] into the hash table. + * window[cur_input_pos] into the hash table. * * hash_head is set to the index of the previous string in the * hash bucket, or 0 if there is no such string */ - if (uncompressed_len - cur_input_pos >= params->min_match) { + if (window_size - cur_input_pos >= params->min_match) { hash = insert_string(hash_tab, prev_tab, - uncompressed_data, + window, cur_input_pos, hash); hash_head = prev_tab[cur_input_pos]; } else { @@ -258,16 +241,24 @@ unsigned lz_analyze_block(const u8 uncompressed_data[], prev_start = match_start; match_len = params->min_match - 1; - if (hash_head != 0 && prev_len < params->max_lazy_match) { + if (cur_input_pos > params->max_offset) + min_start_pos = cur_input_pos - params->max_offset; + else + min_start_pos = 1; + + if (hash_head >= min_start_pos && + prev_len < params->max_lazy_match) + { /* To simplify the code, we prevent matches with the * string of window index 0 (in particular we have to * avoid a match of the string with itself at the start * of the input file). */ - match_len = longest_match(uncompressed_data, - uncompressed_len - cur_input_pos, + match_len = longest_match(window, + window_size - cur_input_pos, cur_input_pos, prev_tab, hash_head, prev_len, - &match_start, params); + &match_start, params, + min_start_pos); if (match_len == params->min_match && cur_input_pos - match_start > params->too_far) @@ -279,68 +270,44 @@ unsigned lz_analyze_block(const u8 uncompressed_data[], */ if (prev_len >= params->min_match && match_len <= prev_len) { + + (*record_match)(prev_len, + cur_input_pos - 1 - prev_start, + record_ctx); + + /* Insert in hash table all strings up to the end of the + * match. strstart-1 and strstart are already inserted. + * If there is not enough lookahead, the last two + * strings are not inserted in the hash table. */ + /* Do not insert strings in hash table beyond this. */ - unsigned max_insert = uncompressed_len - params->min_match; - - /*DEBUG("Recording match (pos = %u, offset = %u, len = %u)\n",*/ - /*cur_input_pos - 1, */ - /*cur_input_pos - 1 - prev_start,*/ - /*prev_len);*/ - - match = (*record_match)(cur_input_pos - 1 - prev_start, - prev_len, - record_match_arg1, - record_match_arg2); - - match_tab[cur_match_pos++] = match; - - /* Insert in hash table all strings up to the end of the match. - * strstart-1 and strstart are already inserted. If there is not - * enough lookahead, the last two strings are not inserted in - * the hash table. - */ -#if LZ_MIN_MATCH == 2 - if (prev_len >= 3) -#endif - { - prev_len -= 2; - - do { - if (++cur_input_pos <= max_insert) { - hash = insert_string(hash_tab, prev_tab, - uncompressed_data, - cur_input_pos, - hash); - } - } while (--prev_len != 0); - } + unsigned max_insert = window_size - params->min_match; + + prev_len -= 2; + + do { + if (++cur_input_pos <= max_insert) { + hash = insert_string(hash_tab, prev_tab, + window, + cur_input_pos, + hash); + } + } while (--prev_len != 0); match_available = false; match_len = params->min_match - 1; } else if (match_available) { - /* If there was no match at the previous position, output a - * single literal. If there was a match but the current match - * is longer, truncate the previous match to a single literal. - */ - - /*DEBUG("Recording litrl (pos = %u, value = %u)\n",*/ - /*cur_input_pos - 1, */ - /*uncompressed_data[cur_input_pos - 1]);*/ - - match = (*record_literal)( - uncompressed_data[cur_input_pos - 1], - record_literal_arg); - match_tab[cur_match_pos++] = match; + /* If there was no match at the previous position, + * output a single literal. If there was a match but the + * current match is longer, truncate the previous match + * to a single literal. */ + (*record_literal)(window[cur_input_pos - 1], record_ctx); } else { /* There is no previous match to compare with, wait for * the next step to decide. */ match_available = true; } - } while (++cur_input_pos < uncompressed_len); - - if (match_available) { - match = (*record_literal)(uncompressed_data[cur_input_pos - 1], - record_literal_arg); - match_tab[cur_match_pos++] = match; - } - return cur_match_pos; + } while (++cur_input_pos < window_size); + + if (match_available) + (*record_literal)(window[cur_input_pos - 1], record_ctx); }