/*
- * lz.c
+ * lz77.c
*
* This file provides the code to analyze a buffer of uncompressed data for
* matches, as per the LZ77 algorithm. It uses a hash table to accelerate the
* along with wimlib; if not, see http://www.gnu.org/licenses/.
*/
-#include "compress.h"
-#include <string.h>
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
-#define LZ_MIN_MATCH 3
+#include "wimlib/compress_common.h"
+#include "wimlib/util.h"
+
+#include <string.h>
#define HASH_BITS 15
#define HASH_SIZE (1 << HASH_BITS)
#define HASH_MASK (HASH_SIZE - 1)
-
-#if LZ_MIN_MATCH == 2
-# define HASH_SHIFT 8
-#elif LZ_MIN_MATCH == 3
-# define HASH_SHIFT 5
-#else
-#error "Invalid LZ_MIN_MATCH"
-#endif
+#define HASH_SHIFT 5
/* Hash function, based on code from zlib. This function will update and return
* the hash value @hash for the string ending on the additional input character
* The AND operation guarantees that only 3 characters will affect the hash
* value, so every identical 3-character string will have the same hash value.
*/
-static inline unsigned update_hash(unsigned hash, u8 c)
+static inline unsigned
+update_hash(unsigned hash, u8 c)
{
return ((hash << HASH_SHIFT) ^ c) & HASH_MASK;
}
* to walk through the hash chain, until the special index `0' is reached,
* indicating the end of the hash chain.
*/
-static inline unsigned insert_string(u16 hash_tab[], u16 prev_tab[],
- const u8 window[], unsigned str_pos,
- unsigned hash)
+static inline unsigned
+insert_string(input_idx_t hash_tab[], input_idx_t prev_tab[],
+ const u8 window[], unsigned str_pos,
+ unsigned hash)
{
- hash = update_hash(hash, window[str_pos + LZ_MIN_MATCH - 1]);
+ hash = update_hash(hash, window[str_pos + 2]);
prev_tab[str_pos] = hash_tab[hash];
hash_tab[hash] = str_pos;
return hash;
* @window: The window of uncompressed data.
* @bytes_remaining: The number of bytes remaining in the window.
* @strstart: The index of the start of the string in the window that
- * we are trying to find a match for.
+ * we are trying to find a match for.
* @prev_tab: The array of prev pointers for the hash table.
* @cur_match: The index of the head of the hash chain for matches
- * having the hash value of the string beginning
- * at index @strstart.
+ * having the hash value of the string beginning
+ * at index @strstart.
* @prev_len: The length of the match that was found for the string
- * beginning at (@strstart - 1).
+ * beginning at (@strstart - 1).
* @match_start_ret: A location into which the index of the start of the
- * match will be returned.
+ * match will be returned.
* @params: Parameters that affect how long the search will proceed
- * before going with the best that has been found
- * so far.
+ * before going with the best that has been found
+ * so far.
+ * @min_start_pos: If the chain reaches a match starting before this
+ * position (including the end-of-chain 0), the search will
+ * be terminated.
*
* Returns the length of the match that was found.
*/
-static unsigned longest_match(const u8 window[], unsigned bytes_remaining,
- unsigned strstart, const u16 prev_tab[],
- unsigned cur_match, unsigned prev_len,
- unsigned *match_start_ret,
- const struct lz_params *params)
+static unsigned
+longest_match(const u8 window[], unsigned bytes_remaining,
+ unsigned strstart, const input_idx_t prev_tab[],
+ unsigned cur_match, unsigned prev_len,
+ unsigned *match_start_ret,
+ const struct lz_params *params,
+ unsigned min_start_pos)
{
unsigned chain_len = params->max_chain_len;
* performance reasons. Therefore uninitialized memory will be
* accessed, and conditional jumps will be made that depend on
* those values. However the length of the match is limited to
- * the lookahead, so the output of deflate is not affected by
- * the uninitialized values.
- */
+ * the lookahead, so the output of lz_analyze_block() is not
+ * affected by the uninitialized values. */
if (match[best_len] != scan_end
|| match[best_len - 1] != scan_end1
scan_end1 = scan[best_len - 1];
scan_end = scan[best_len];
}
- } while (--chain_len != 0 && (cur_match = prev_tab[cur_match]) != 0);
+ } while (--chain_len != 0 && (cur_match = prev_tab[cur_match]) >= min_start_pos);
*match_start_ret = match_start;
return min(min(best_len, bytes_remaining), params->max_match);
}
* Determines the sequence of matches and literals that a block of data will be
* compressed to.
*
- * @uncompressed_data: The data that is to be compressed.
- * @uncompressed_len: The length of @uncompressed_data, in bytes.
- * @match_tab: An array for the intermediate representation of matches.
- * @record_match: A function that will be called to produce the
- * intermediate representation of a match, given
- * the offset and length. This function should also
- * update the appropriate symbol frequency counts
- * so that any needed Huffman codes can be made
- * later.
- * @record_literal: A function that will be called to produce the
- * intermediate representation of a literal, given
- * the character of the literal. This function
- * should also update the appropriate symbol
- * frequency counts so that any needed Huffman
- * codes can be made later.
- * @record_match_arg_1:
- * @record_match_arg_2: Extra arguments to be passed to @record_match.
- * @record_literal_arg: Extra arguments to be passed to @record_literal.
+ * @window: The data that is to be compressed.
+ * @window_size: The length of @window, in bytes.
+ * @record_match: Consumer for matches.
+ * @record_literal: Consumer for literals.
+ * @record_ctx: Context passed to @record_match and @record_literal.
* @params: Structure that contains parameters that affect how the
- * analysis proceeds (mainly how good the matches
- * have to be).
- *
- * Returns the total number of matches and literal bytes that were found; this
- * is the number of slots in @match_tab that have been filled with the
- * intermediate representation of a match or literal byte.
+ * analysis proceeds (mainly how good the matches
+ * have to be).
+ * @prev_tab: Temporary space containing least @window_size elements.
*/
-unsigned lz_analyze_block(const u8 uncompressed_data[],
- unsigned uncompressed_len,
- u32 match_tab[],
- lz_record_match_t record_match,
- lz_record_literal_t record_literal,
- void *record_match_arg1,
- void *record_match_arg2,
- void *record_literal_arg,
- const struct lz_params *params)
+void
+lz_analyze_block(const u8 window[restrict],
+ input_idx_t window_size,
+ lz_record_match_t record_match,
+ lz_record_literal_t record_literal,
+ void *record_ctx,
+ const struct lz_params *params,
+ input_idx_t prev_tab[restrict])
{
- unsigned cur_match_pos = 0;
unsigned cur_input_pos = 0;
unsigned hash = 0;
unsigned hash_head = 0;
unsigned match_len = params->min_match - 1;
unsigned match_start = 0;
bool match_available = false;
- u16 hash_tab[HASH_SIZE];
- u32 match;
- u16 prev_tab[uncompressed_len];
+ input_idx_t hash_tab[HASH_SIZE];
+ unsigned min_start_pos = 1;
ZERO_ARRAY(hash_tab);
- ZERO_ARRAY(prev_tab);
do {
/* If there are at least 3 characters remaining in the input,
* insert the 3-character string beginning at
- * uncompressed_data[cur_input_pos] into the hash table.
+ * window[cur_input_pos] into the hash table.
*
* hash_head is set to the index of the previous string in the
* hash bucket, or 0 if there is no such string */
- if (uncompressed_len - cur_input_pos >= params->min_match) {
+ if (window_size - cur_input_pos >= params->min_match) {
hash = insert_string(hash_tab, prev_tab,
- uncompressed_data,
+ window,
cur_input_pos, hash);
hash_head = prev_tab[cur_input_pos];
} else {
prev_start = match_start;
match_len = params->min_match - 1;
- if (hash_head != 0 && prev_len < params->max_lazy_match) {
+ if (cur_input_pos > params->max_offset)
+ min_start_pos = cur_input_pos - params->max_offset;
+ else
+ min_start_pos = 1;
+
+ if (hash_head >= min_start_pos &&
+ prev_len < params->max_lazy_match)
+ {
/* To simplify the code, we prevent matches with the
* string of window index 0 (in particular we have to
* avoid a match of the string with itself at the start
* of the input file). */
- match_len = longest_match(uncompressed_data,
- uncompressed_len - cur_input_pos,
+ match_len = longest_match(window,
+ window_size - cur_input_pos,
cur_input_pos, prev_tab,
hash_head, prev_len,
- &match_start, params);
+ &match_start, params,
+ min_start_pos);
if (match_len == params->min_match &&
cur_input_pos - match_start > params->too_far)
*/
if (prev_len >= params->min_match && match_len <= prev_len) {
+
+ (*record_match)(prev_len,
+ cur_input_pos - 1 - prev_start,
+ record_ctx);
+
+ /* Insert in hash table all strings up to the end of the
+ * match. strstart-1 and strstart are already inserted.
+ * If there is not enough lookahead, the last two
+ * strings are not inserted in the hash table. */
+
/* Do not insert strings in hash table beyond this. */
- unsigned max_insert = uncompressed_len - params->min_match;
-
- /*DEBUG("Recording match (pos = %u, offset = %u, len = %u)\n",*/
- /*cur_input_pos - 1, */
- /*cur_input_pos - 1 - prev_start,*/
- /*prev_len);*/
-
- match = (*record_match)(cur_input_pos - 1 - prev_start,
- prev_len,
- record_match_arg1,
- record_match_arg2);
-
- match_tab[cur_match_pos++] = match;
-
- /* Insert in hash table all strings up to the end of the match.
- * strstart-1 and strstart are already inserted. If there is not
- * enough lookahead, the last two strings are not inserted in
- * the hash table.
- */
-#if LZ_MIN_MATCH == 2
- if (prev_len >= 3)
-#endif
- {
- prev_len -= 2;
-
- do {
- if (++cur_input_pos <= max_insert) {
- hash = insert_string(hash_tab, prev_tab,
- uncompressed_data,
- cur_input_pos,
- hash);
- }
- } while (--prev_len != 0);
- }
+ unsigned max_insert = window_size - params->min_match;
+
+ prev_len -= 2;
+
+ do {
+ if (++cur_input_pos <= max_insert) {
+ hash = insert_string(hash_tab, prev_tab,
+ window,
+ cur_input_pos,
+ hash);
+ }
+ } while (--prev_len != 0);
match_available = false;
match_len = params->min_match - 1;
} else if (match_available) {
- /* If there was no match at the previous position, output a
- * single literal. If there was a match but the current match
- * is longer, truncate the previous match to a single literal.
- */
-
- /*DEBUG("Recording litrl (pos = %u, value = %u)\n",*/
- /*cur_input_pos - 1, */
- /*uncompressed_data[cur_input_pos - 1]);*/
-
- match = (*record_literal)(
- uncompressed_data[cur_input_pos - 1],
- record_literal_arg);
- match_tab[cur_match_pos++] = match;
+ /* If there was no match at the previous position,
+ * output a single literal. If there was a match but the
+ * current match is longer, truncate the previous match
+ * to a single literal. */
+ (*record_literal)(window[cur_input_pos - 1], record_ctx);
} else {
/* There is no previous match to compare with, wait for
* the next step to decide. */
match_available = true;
}
- } while (++cur_input_pos < uncompressed_len);
-
- if (match_available) {
- match = (*record_literal)(uncompressed_data[cur_input_pos - 1],
- record_literal_arg);
- match_tab[cur_match_pos++] = match;
- }
- return cur_match_pos;
+ } while (++cur_input_pos < window_size);
+
+ if (match_available)
+ (*record_literal)(window[cur_input_pos - 1], record_ctx);
}