X-Git-Url: https://wimlib.net/git/?a=blobdiff_plain;f=src%2Flz77.c;h=b5495da74d1599cd59e12cc8ef65c495550eba55;hb=4f433755e8f9ef79dbb4699430d047f74e338e82;hp=924663d0cb634b29c1a60d61f202dc16dc1e6fbf;hpb=10a87017a0a82d34ed3981e1f5e586b5b8613e3f;p=wimlib

diff --git a/src/lz77.c b/src/lz77.c
index 924663d0..b5495da7 100644
--- a/src/lz77.c
+++ b/src/lz77.c
@@ -1,5 +1,5 @@
 /*
- * lz.c
+ * lz77.c
  *
  * This file provides the code to analyze a buffer of uncompressed data for
  * matches, as per the LZ77 algorithm.  It uses a hash table to accelerate the
@@ -7,7 +7,7 @@
  */
 
 /*
- * Copyright (C) 2012, 2013 Biggers
+ * Copyright (C) 2012, 2013 Eric Biggers
  * Copyright (C) 1995-2010 Jean-loup Gailly and Mark Adler
  *
  * This file is part of wimlib, a library for working with WIM files.
@@ -26,22 +26,19 @@
  * along with wimlib; if not, see http://www.gnu.org/licenses/.
  */
 
-#include "compress.h"
-#include <string.h>
+#ifdef HAVE_CONFIG_H
+#  include <config.h>
+#endif
 
-#define LZ_MIN_MATCH 3
+#include "wimlib/compress_common.h"
+#include "wimlib/util.h"
+
+#include <string.h>
 
 #define HASH_BITS	15
 #define HASH_SIZE	(1 << HASH_BITS)
 #define HASH_MASK	(HASH_SIZE - 1)
-
-#if LZ_MIN_MATCH == 2
-#	define HASH_SHIFT	8
-#elif LZ_MIN_MATCH == 3
-#	define HASH_SHIFT	5
-#else
-#error "Invalid LZ_MIN_MATCH"
-#endif
+#define HASH_SHIFT	5
 
 /* Hash function, based on code from zlib.  This function will update and return
  * the hash value @hash for the string ending on the additional input character
@@ -52,7 +49,8 @@
  * The AND operation guarantees that only 3 characters will affect the hash
  * value, so every identical 3-character string will have the same hash value.
  */
-static inline unsigned update_hash(unsigned hash, u8 c)
+static inline unsigned
+update_hash(unsigned hash, u8 c)
 {
 	return ((hash << HASH_SHIFT) ^ c) & HASH_MASK;
 }
@@ -70,11 +68,12 @@ static inline unsigned update_hash(unsigned hash, u8 c)
  * to walk through the hash chain, until the special index `0' is reached,
  * indicating the end of the hash chain.
  */
-static inline unsigned insert_string(u16 hash_tab[], u16 prev_tab[],
-				     const u8 window[], unsigned str_pos,
-				     unsigned hash)
+static inline unsigned
+insert_string(input_idx_t hash_tab[], input_idx_t prev_tab[],
+	      const u8 window[], unsigned str_pos,
+	      unsigned hash)
 {
-	hash = update_hash(hash, window[str_pos + LZ_MIN_MATCH - 1]);
+	hash = update_hash(hash, window[str_pos + 2]);
 	prev_tab[str_pos] = hash_tab[hash];
 	hash_tab[hash] = str_pos;
 	return hash;
@@ -87,26 +86,31 @@ static inline unsigned insert_string(u16 hash_tab[], u16 prev_tab[],
  * @window:		The window of uncompressed data.
  * @bytes_remaining:	The number of bytes remaining in the window.
  * @strstart:		The index of the start of the string in the window that
- * 				we are trying to find a match for.
+ *				we are trying to find a match for.
  * @prev_tab:		The array of prev pointers for the hash table.
  * @cur_match:		The index of the head of the hash chain for matches
- * 				having the hash value of the string beginning
- * 				at index @strstart.
+ *				having the hash value of the string beginning
+ *				at index @strstart.
  * @prev_len:		The length of the match that was found for the string
- * 				beginning at (@strstart - 1).
+ *				beginning at (@strstart - 1).
  * @match_start_ret:	A location into which the index of the start of the
- * 				match will be returned.
+ *				match will be returned.
  * @params:		Parameters that affect how long the search will proceed
- * 				before going with the best that has been found
- * 				so far.
+ *				before going with the best that has been found
+ *				so far.
+ * @min_start_pos:	If the chain reaches a match starting before this
+ *			position (including the end-of-chain 0), the search will
+ *			be terminated.
  *
  * Returns the length of the match that was found.
  */
-static unsigned longest_match(const u8 window[], unsigned bytes_remaining,
-			      unsigned strstart, const u16 prev_tab[],
-			      unsigned cur_match, unsigned prev_len,
-			      unsigned *match_start_ret,
-			      const struct lz_params *params)
+static unsigned
+longest_match(const u8 window[], unsigned bytes_remaining,
+	      unsigned strstart, const input_idx_t prev_tab[],
+	      unsigned cur_match, unsigned prev_len,
+	      unsigned *match_start_ret,
+	      const struct lz_params *params,
+	      unsigned min_start_pos)
 {
 	unsigned chain_len = params->max_chain_len;
 
@@ -137,9 +141,8 @@ static unsigned longest_match(const u8 window[], unsigned bytes_remaining,
 		 * performance reasons.  Therefore uninitialized memory will be
 		 * accessed, and conditional jumps will be made that depend on
 		 * those values.  However the length of the match is limited to
-		 * the lookahead, so the output of deflate is not affected by
-		 * the uninitialized values.
-		 */
+		 * the lookahead, so the output of lz_analyze_block() is not
+		 * affected by the uninitialized values.  */
 
 		if (match[best_len] != scan_end
 		    || match[best_len - 1] != scan_end1
@@ -173,7 +176,7 @@ static unsigned longest_match(const u8 window[], unsigned bytes_remaining,
 			scan_end1  = scan[best_len - 1];
 			scan_end   = scan[best_len];
 		}
-	} while (--chain_len != 0 && (cur_match = prev_tab[cur_match]) != 0);
+	} while (--chain_len != 0 && (cur_match = prev_tab[cur_match]) >= min_start_pos);
 	*match_start_ret = match_start;
 	return min(min(best_len, bytes_remaining), params->max_match);
 }
@@ -184,43 +187,25 @@ static unsigned longest_match(const u8 window[], unsigned bytes_remaining,
  * Determines the sequence of matches and literals that a block of data will be
  * compressed to.
  *
- * @uncompressed_data:	The data that is to be compressed.
- * @uncompressed_len:	The length of @uncompressed_data, in bytes.
- * @match_tab:		An array for the intermediate representation of matches.
- * @record_match:	A function that will be called to produce the
- * 				intermediate representation of a match, given
- * 				the offset and length.  This function should also
- * 				update the appropriate symbol frequency counts
- * 				so that any needed Huffman codes can be made
- * 				later.
- * @record_literal:	A function that will be called to produce the
- * 				intermediate representation of a literal, given
- * 				the character of the literal.  This function
- * 				should also update the appropriate symbol
- * 				frequency counts so that any needed Huffman
- * 				codes can be made later.
- * @record_match_arg_1:
- * @record_match_arg_2:	Extra arguments to be passed to @record_match.
- * @record_literal_arg:	Extra arguments to be passed to @record_literal.
+ * @window:		The data that is to be compressed.
+ * @window_size:	The length of @window, in bytes.
+ * @record_match:	Consumer for matches.
+ * @record_literal:	Consumer for literals.
+ * @record_ctx:		Context passed to @record_match and @record_literal.
  * @params:		Structure that contains parameters that affect how the
- * 				analysis proceeds (mainly how good the matches
- * 				have to be).
- *
- * Returns the total number of matches and literal bytes that were found; this
- * is the number of slots in @match_tab that have been filled with the
- * intermediate representation of a match or literal byte.
+ *				analysis proceeds (mainly how good the matches
+ *				have to be).
+ * @prev_tab:		Temporary space containing least @window_size elements.
  */
-unsigned lz_analyze_block(const u8 uncompressed_data[],
-			  unsigned uncompressed_len,
-			  u32 match_tab[],
-			  lz_record_match_t record_match,
-			  lz_record_literal_t record_literal,
-			  void *record_match_arg1,
-			  void *record_match_arg2,
-			  void *record_literal_arg,
-			  const struct lz_params *params)
+void
+lz_analyze_block(const u8 window[restrict],
+		 input_idx_t window_size,
+		 lz_record_match_t record_match,
+		 lz_record_literal_t record_literal,
+		 void *record_ctx,
+		 const struct lz_params *params,
+		 input_idx_t prev_tab[restrict])
 {
-	unsigned cur_match_pos = 0;
 	unsigned cur_input_pos = 0;
 	unsigned hash          = 0;
 	unsigned hash_head     = 0;
@@ -229,23 +214,21 @@ unsigned lz_analyze_block(const u8 uncompressed_data[],
 	unsigned match_len     = params->min_match - 1;
 	unsigned match_start   = 0;
 	bool match_available = false;
-	u16 hash_tab[HASH_SIZE];
-	u32 match;
-	u16 prev_tab[uncompressed_len];
+	input_idx_t hash_tab[HASH_SIZE];
+	unsigned min_start_pos = 1;
 
 	ZERO_ARRAY(hash_tab);
-	ZERO_ARRAY(prev_tab);
 
 	do {
 		/* If there are at least 3 characters remaining in the input,
 		 * insert the 3-character string beginning at
-		 * uncompressed_data[cur_input_pos] into the hash table.
+		 * window[cur_input_pos] into the hash table.
 		 *
 		 * hash_head is set to the index of the previous string in the
 		 * hash bucket, or 0 if there is no such string */
-		if (uncompressed_len - cur_input_pos >= params->min_match) {
+		if (window_size - cur_input_pos >= params->min_match) {
 			hash = insert_string(hash_tab, prev_tab,
-					     uncompressed_data,
+					     window,
 					     cur_input_pos, hash);
 			hash_head = prev_tab[cur_input_pos];
 		} else {
@@ -258,16 +241,24 @@ unsigned lz_analyze_block(const u8 uncompressed_data[],
 		prev_start = match_start;
 		match_len = params->min_match - 1;
 
-		if (hash_head != 0 && prev_len < params->max_lazy_match) {
+		if (cur_input_pos > params->max_offset)
+			min_start_pos = cur_input_pos - params->max_offset;
+		else
+			min_start_pos = 1;
+
+		if (hash_head >= min_start_pos &&
+		    prev_len < params->max_lazy_match)
+		{
 			/* To simplify the code, we prevent matches with the
 			 * string of window index 0 (in particular we have to
 			 * avoid a match of the string with itself at the start
 			 * of the input file).  */
-			match_len = longest_match(uncompressed_data,
-						  uncompressed_len - cur_input_pos,
+			match_len = longest_match(window,
+						  window_size - cur_input_pos,
 						  cur_input_pos, prev_tab,
 						  hash_head, prev_len,
-						  &match_start, params);
+						  &match_start, params,
+						  min_start_pos);
 
 			if (match_len == params->min_match &&
 			     cur_input_pos - match_start > params->too_far)
@@ -279,68 +270,44 @@ unsigned lz_analyze_block(const u8 uncompressed_data[],
 		 */
 		if (prev_len >= params->min_match && match_len <= prev_len) {
 
+
+			(*record_match)(prev_len,
+					cur_input_pos - 1 - prev_start,
+					record_ctx);
+
+			/* Insert in hash table all strings up to the end of the
+			 * match.  strstart-1 and strstart are already inserted.
+			 * If there is not enough lookahead, the last two
+			 * strings are not inserted in the hash table.  */
+
 			/* Do not insert strings in hash table beyond this. */
-			unsigned max_insert = uncompressed_len - params->min_match;
-
-			/*DEBUG("Recording match (pos = %u, offset = %u, len = %u)\n",*/
-					/*cur_input_pos - 1, */
-					/*cur_input_pos - 1 - prev_start,*/
-					/*prev_len);*/
-
-			match = (*record_match)(cur_input_pos - 1 - prev_start,
-						prev_len,
-						record_match_arg1,
-						record_match_arg2);
-
-			match_tab[cur_match_pos++] = match;
-
-			/* Insert in hash table all strings up to the end of the match.
-			 * strstart-1 and strstart are already inserted. If there is not
-			 * enough lookahead, the last two strings are not inserted in
-			 * the hash table.
-			 */
-#if LZ_MIN_MATCH == 2
-			if (prev_len >= 3)
-#endif
-			{
-				prev_len -= 2;
-
-				do {
-					if (++cur_input_pos <= max_insert) {
-						hash = insert_string(hash_tab, prev_tab,
-								     uncompressed_data,
-								     cur_input_pos,
-								     hash);
-					}
-				} while (--prev_len != 0);
-			}
+			unsigned max_insert = window_size - params->min_match;
+
+			prev_len -= 2;
+
+			do {
+				if (++cur_input_pos <= max_insert) {
+					hash = insert_string(hash_tab, prev_tab,
+							     window,
+							     cur_input_pos,
+							     hash);
+				}
+			} while (--prev_len != 0);
 			match_available = false;
 			match_len = params->min_match - 1;
 		} else if (match_available) {
-			/* If there was no match at the previous position, output a
-			 * single literal. If there was a match but the current match
-			 * is longer, truncate the previous match to a single literal.
-			 */
-
-			/*DEBUG("Recording litrl (pos = %u, value = %u)\n",*/
-					/*cur_input_pos - 1, */
-					/*uncompressed_data[cur_input_pos - 1]);*/
-
-			match = (*record_literal)(
-					uncompressed_data[cur_input_pos - 1],
-							record_literal_arg);
-			match_tab[cur_match_pos++] = match;
+			/* If there was no match at the previous position,
+			 * output a single literal. If there was a match but the
+			 * current match is longer, truncate the previous match
+			 * to a single literal.  */
+			(*record_literal)(window[cur_input_pos - 1], record_ctx);
 		} else {
 			/* There is no previous match to compare with, wait for
 			 * the next step to decide.  */
 			match_available = true;
 		}
-	} while (++cur_input_pos < uncompressed_len);
-
-	if (match_available) {
-		match = (*record_literal)(uncompressed_data[cur_input_pos - 1],
-						record_literal_arg);
-		match_tab[cur_match_pos++] = match;
-	}
-	return cur_match_pos;
+	} while (++cur_input_pos < window_size);
+
+	if (match_available)
+		(*record_literal)(window[cur_input_pos - 1], record_ctx);
 }