write_stream_uncompressed(): Do not issue unnecessary hard error

[wimlib] / src / lz77.c
diff --git a/src/lz77.c b/src/lz77.c

index 36166efe659c3a59fe082fbefb24eaddb33207b6..b5495da74d1599cd59e12cc8ef65c495550eba55 100644 (file)
--- a/src/lz77.c
+++ b/src/lz77.c
@@ -1,5 +1,5 @@
  /*
- * lz.c
+ * lz77.c
   *
   * This file provides the code to analyze a buffer of uncompressed data for
   * matches, as per the LZ77 algorithm.  It uses a hash table to accelerate the
@@ -26,22 +26,19 @@
   * along with wimlib; if not, see http://www.gnu.org/licenses/.
   */
  
-#include "compress.h"
-#include <string.h>
+#ifdef HAVE_CONFIG_H
+#  include <config.h>
+#endif
  
-#define LZ_MIN_MATCH 3
+#include "wimlib/compress_common.h"
+#include "wimlib/util.h"
+
+#include <string.h>
  
  #define HASH_BITS      15
  #define HASH_SIZE      (1 << HASH_BITS)
  #define HASH_MASK      (HASH_SIZE - 1)
-
-#if LZ_MIN_MATCH == 2
-#      define HASH_SHIFT       8
-#elif LZ_MIN_MATCH == 3
-#      define HASH_SHIFT       5
-#else
-#error "Invalid LZ_MIN_MATCH"
-#endif
+#define HASH_SHIFT     5
  
  /* Hash function, based on code from zlib.  This function will update and return
   * the hash value @hash for the string ending on the additional input character
@@ -52,7 +49,8 @@
   * The AND operation guarantees that only 3 characters will affect the hash
   * value, so every identical 3-character string will have the same hash value.
   */
-static inline unsigned update_hash(unsigned hash, u8 c)
+static inline unsigned
+update_hash(unsigned hash, u8 c)
  {
         return ((hash << HASH_SHIFT) ^ c) & HASH_MASK;
  }
@@ -70,11 +68,12 @@ static inline unsigned update_hash(unsigned hash, u8 c)
   * to walk through the hash chain, until the special index `0' is reached,
   * indicating the end of the hash chain.
   */
-static inline unsigned insert_string(u16 hash_tab[], u16 prev_tab[],
-                                    const u8 window[], unsigned str_pos,
-                                    unsigned hash)
+static inline unsigned
+insert_string(input_idx_t hash_tab[], input_idx_t prev_tab[],
+             const u8 window[], unsigned str_pos,
+             unsigned hash)
  {
-       hash = update_hash(hash, window[str_pos + LZ_MIN_MATCH - 1]);
+       hash = update_hash(hash, window[str_pos + 2]);
         prev_tab[str_pos] = hash_tab[hash];
         hash_tab[hash] = str_pos;
         return hash;
@@ -87,26 +86,31 @@ static inline unsigned insert_string(u16 hash_tab[], u16 prev_tab[],
   * @window:            The window of uncompressed data.
   * @bytes_remaining:   The number of bytes remaining in the window.
   * @strstart:          The index of the start of the string in the window that
- *                             we are trying to find a match for.
+ *                             we are trying to find a match for.
   * @prev_tab:          The array of prev pointers for the hash table.
   * @cur_match:         The index of the head of the hash chain for matches
- *                             having the hash value of the string beginning
- *                             at index @strstart.
+ *                             having the hash value of the string beginning
+ *                             at index @strstart.
   * @prev_len:          The length of the match that was found for the string
- *                             beginning at (@strstart - 1).
+ *                             beginning at (@strstart - 1).
   * @match_start_ret:   A location into which the index of the start of the
- *                             match will be returned.
+ *                             match will be returned.
   * @params:            Parameters that affect how long the search will proceed
- *                             before going with the best that has been found
- *                             so far.
+ *                             before going with the best that has been found
+ *                             so far.
+ * @min_start_pos:     If the chain reaches a match starting before this
+ *                     position (including the end-of-chain 0), the search will
+ *                     be terminated.
   *
   * Returns the length of the match that was found.
   */
-static unsigned longest_match(const u8 window[], unsigned bytes_remaining,
-                             unsigned strstart, const u16 prev_tab[],
-                             unsigned cur_match, unsigned prev_len,
-                             unsigned *match_start_ret,
-                             const struct lz_params *params)
+static unsigned
+longest_match(const u8 window[], unsigned bytes_remaining,
+             unsigned strstart, const input_idx_t prev_tab[],
+             unsigned cur_match, unsigned prev_len,
+             unsigned *match_start_ret,
+             const struct lz_params *params,
+             unsigned min_start_pos)
  {
         unsigned chain_len = params->max_chain_len;
  
@@ -137,9 +141,8 @@ static unsigned longest_match(const u8 window[], unsigned bytes_remaining,
                  * performance reasons.  Therefore uninitialized memory will be
                  * accessed, and conditional jumps will be made that depend on
                  * those values.  However the length of the match is limited to
-                * the lookahead, so the output of deflate is not affected by
-                * the uninitialized values.
-                */
+                * the lookahead, so the output of lz_analyze_block() is not
+                * affected by the uninitialized values.  */
  
                 if (match[best_len] != scan_end
                     || match[best_len - 1] != scan_end1
@@ -173,7 +176,7 @@ static unsigned longest_match(const u8 window[], unsigned bytes_remaining,
                         scan_end1  = scan[best_len - 1];
                         scan_end   = scan[best_len];
                 }
-       } while (--chain_len != 0 && (cur_match = prev_tab[cur_match]) != 0);
+       } while (--chain_len != 0 && (cur_match = prev_tab[cur_match]) >= min_start_pos);
         *match_start_ret = match_start;
         return min(min(best_len, bytes_remaining), params->max_match);
  }
@@ -184,43 +187,25 @@ static unsigned longest_match(const u8 window[], unsigned bytes_remaining,
   * Determines the sequence of matches and literals that a block of data will be
   * compressed to.
   *
- * @uncompressed_data: The data that is to be compressed.
- * @uncompressed_len:  The length of @uncompressed_data, in bytes.
- * @match_tab:         An array for the intermediate representation of matches.
- * @record_match:      A function that will be called to produce the
- *                             intermediate representation of a match, given
- *                             the offset and length.  This function should also
- *                             update the appropriate symbol frequency counts
- *                             so that any needed Huffman codes can be made
- *                             later.
- * @record_literal:    A function that will be called to produce the
- *                             intermediate representation of a literal, given
- *                             the character of the literal.  This function
- *                             should also update the appropriate symbol
- *                             frequency counts so that any needed Huffman
- *                             codes can be made later.
- * @record_match_arg_1:
- * @record_match_arg_2:        Extra arguments to be passed to @record_match.
- * @record_literal_arg:        Extra arguments to be passed to @record_literal.
+ * @window:            The data that is to be compressed.
+ * @window_size:       The length of @window, in bytes.
+ * @record_match:      Consumer for matches.
+ * @record_literal:    Consumer for literals.
+ * @record_ctx:                Context passed to @record_match and @record_literal.
   * @params:            Structure that contains parameters that affect how the
- *                             analysis proceeds (mainly how good the matches
- *                             have to be).
- *
- * Returns the total number of matches and literal bytes that were found; this
- * is the number of slots in @match_tab that have been filled with the
- * intermediate representation of a match or literal byte.
+ *                             analysis proceeds (mainly how good the matches
+ *                             have to be).
+ * @prev_tab:          Temporary space containing least @window_size elements.
   */
-unsigned lz_analyze_block(const u8 uncompressed_data[],
-                         unsigned uncompressed_len,
-                         u32 match_tab[],
-                         lz_record_match_t record_match,
-                         lz_record_literal_t record_literal,
-                         void *record_match_arg1,
-                         void *record_match_arg2,
-                         void *record_literal_arg,
-                         const struct lz_params *params)
+void
+lz_analyze_block(const u8 window[restrict],
+                input_idx_t window_size,
+                lz_record_match_t record_match,
+                lz_record_literal_t record_literal,
+                void *record_ctx,
+                const struct lz_params *params,
+                input_idx_t prev_tab[restrict])
  {
-       unsigned cur_match_pos = 0;
         unsigned cur_input_pos = 0;
         unsigned hash          = 0;
         unsigned hash_head     = 0;
@@ -229,23 +214,21 @@ unsigned lz_analyze_block(const u8 uncompressed_data[],
         unsigned match_len     = params->min_match - 1;
         unsigned match_start   = 0;
         bool match_available = false;
-       u16 hash_tab[HASH_SIZE];
-       u32 match;
-       u16 prev_tab[uncompressed_len];
+       input_idx_t hash_tab[HASH_SIZE];
+       unsigned min_start_pos = 1;
  
         ZERO_ARRAY(hash_tab);
-       ZERO_ARRAY(prev_tab);
  
         do {
                 /* If there are at least 3 characters remaining in the input,
                  * insert the 3-character string beginning at
-                * uncompressed_data[cur_input_pos] into the hash table.
+                * window[cur_input_pos] into the hash table.
                  *
                  * hash_head is set to the index of the previous string in the
                  * hash bucket, or 0 if there is no such string */
-               if (uncompressed_len - cur_input_pos >= params->min_match) {
+               if (window_size - cur_input_pos >= params->min_match) {
                         hash = insert_string(hash_tab, prev_tab,
-                                            uncompressed_data,
+                                            window,
                                              cur_input_pos, hash);
                         hash_head = prev_tab[cur_input_pos];
                 } else {
@@ -258,16 +241,24 @@ unsigned lz_analyze_block(const u8 uncompressed_data[],
                 prev_start = match_start;
                 match_len = params->min_match - 1;
  
-               if (hash_head != 0 && prev_len < params->max_lazy_match) {
+               if (cur_input_pos > params->max_offset)
+                       min_start_pos = cur_input_pos - params->max_offset;
+               else
+                       min_start_pos = 1;
+
+               if (hash_head >= min_start_pos &&
+                   prev_len < params->max_lazy_match)
+               {
                         /* To simplify the code, we prevent matches with the
                          * string of window index 0 (in particular we have to
                          * avoid a match of the string with itself at the start
                          * of the input file).  */
-                       match_len = longest_match(uncompressed_data,
-                                                 uncompressed_len - cur_input_pos,
+                       match_len = longest_match(window,
+                                                 window_size - cur_input_pos,
                                                   cur_input_pos, prev_tab,
                                                   hash_head, prev_len,
-                                                 &match_start, params);
+                                                 &match_start, params,
+                                                 min_start_pos);
  
                         if (match_len == params->min_match &&
                              cur_input_pos - match_start > params->too_far)
@@ -279,68 +270,44 @@ unsigned lz_analyze_block(const u8 uncompressed_data[],
                  */
                 if (prev_len >= params->min_match && match_len <= prev_len) {
  
+
+                       (*record_match)(prev_len,
+                                       cur_input_pos - 1 - prev_start,
+                                       record_ctx);
+
+                       /* Insert in hash table all strings up to the end of the
+                        * match.  strstart-1 and strstart are already inserted.
+                        * If there is not enough lookahead, the last two
+                        * strings are not inserted in the hash table.  */
+
                         /* Do not insert strings in hash table beyond this. */
-                       unsigned max_insert = uncompressed_len - params->min_match;
-
-                       /*DEBUG("Recording match (pos = %u, offset = %u, len = %u)\n",*/
-                                       /*cur_input_pos - 1, */
-                                       /*cur_input_pos - 1 - prev_start,*/
-                                       /*prev_len);*/
-
-                       match = (*record_match)(cur_input_pos - 1 - prev_start,
-                                               prev_len,
-                                               record_match_arg1,
-                                               record_match_arg2);
-
-                       match_tab[cur_match_pos++] = match;
-
-                       /* Insert in hash table all strings up to the end of the match.
-                        * strstart-1 and strstart are already inserted. If there is not
-                        * enough lookahead, the last two strings are not inserted in
-                        * the hash table.
-                        */
-#if LZ_MIN_MATCH == 2
-                       if (prev_len >= 3)
-#endif
-                       {
-                               prev_len -= 2;
-
-                               do {
-                                       if (++cur_input_pos <= max_insert) {
-                                               hash = insert_string(hash_tab, prev_tab,
-                                                                    uncompressed_data,
-                                                                    cur_input_pos,
-                                                                    hash);
-                                       }
-                               } while (--prev_len != 0);
-                       }
+                       unsigned max_insert = window_size - params->min_match;
+
+                       prev_len -= 2;
+
+                       do {
+                               if (++cur_input_pos <= max_insert) {
+                                       hash = insert_string(hash_tab, prev_tab,
+                                                            window,
+                                                            cur_input_pos,
+                                                            hash);
+                               }
+                       } while (--prev_len != 0);
                         match_available = false;
                         match_len = params->min_match - 1;
                 } else if (match_available) {
-                       /* If there was no match at the previous position, output a
-                        * single literal. If there was a match but the current match
-                        * is longer, truncate the previous match to a single literal.
-                        */
-
-                       /*DEBUG("Recording litrl (pos = %u, value = %u)\n",*/
-                                       /*cur_input_pos - 1, */
-                                       /*uncompressed_data[cur_input_pos - 1]);*/
-
-                       match = (*record_literal)(
-                                       uncompressed_data[cur_input_pos - 1],
-                                                       record_literal_arg);
-                       match_tab[cur_match_pos++] = match;
+                       /* If there was no match at the previous position,
+                        * output a single literal. If there was a match but the
+                        * current match is longer, truncate the previous match
+                        * to a single literal.  */
+                       (*record_literal)(window[cur_input_pos - 1], record_ctx);
                 } else {
                         /* There is no previous match to compare with, wait for
                          * the next step to decide.  */
                         match_available = true;
                 }
-       } while (++cur_input_pos < uncompressed_len);
-
-       if (match_available) {
-               match = (*record_literal)(uncompressed_data[cur_input_pos - 1],
-                                               record_literal_arg);
-               match_tab[cur_match_pos++] = match;
-       }
-       return cur_match_pos;
+       } while (++cur_input_pos < window_size);
+
+       if (match_available)
+               (*record_literal)(window[cur_input_pos - 1], record_ctx);
  }