Re-organize code

[wimlib] / src / lz77.c
diff --git a/src/lz77.c b/src/lz77.c

new file mode 100644 (file)

index 0000000..2926c12
--- /dev/null
+++ b/src/lz77.c
@@ -0,0 +1,341 @@
+/*
+ * lz.c
+ *
+ * This file provides the code to analyze a buffer of uncompressed data for
+ * matches, as per the LZ77 algorithm.  It uses a hash table to accelerate the
+ * process.  This is based on code from zlib (v. 1.2.5).
+ */
+
+/*
+ * Copyright (C) 2012 Eric Biggers
+ * Copyright (C) 1995-2010 Jean-loup Gailly and Mark Adler
+ *
+ * This file is part of wimlib, a library for working with WIM files.
+ *
+ * wimlib is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * wimlib is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
+ * A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with wimlib; if not, see http://www.gnu.org/licenses/.
+ */
+
+#include "compress.h"
+#include <string.h>
+
+#define LZ_MIN_MATCH 3
+
+#define HASH_BITS      15
+#define HASH_SIZE      (1 << HASH_BITS)
+#define HASH_MASK      (HASH_SIZE - 1)
+
+#if LZ_MIN_MATCH == 2
+#      define HASH_SHIFT       8
+#elif LZ_MIN_MATCH == 3
+#      define HASH_SHIFT       5
+#else
+#error "Invalid LZ_MIN_MATCH"
+#endif
+
+/* Hash function, based on code from zlib.  This function will update and return
+ * the hash value @hash for the string ending on the additional input character
+ * @c.  This function must be called for each consecutive character, because it
+ * uses a running hash value rather than computing it separately for each
+ * 3-character string.
+ *
+ * The AND operation guarantees that only 3 characters will affect the hash
+ * value, so every identical 3-character string will have the same hash value.
+ */
+static inline uint update_hash(uint hash, u8 c)
+{
+       return ((hash << HASH_SHIFT) ^ c) & HASH_MASK;
+}
+
+
+/* Insert a 3-character string at position @str_pos in @window and with hash
+ * code @hash into the hash table described by @hash_tab and @prev_tab.  Based
+ * on code from zlib.
+ *
+ * The hash table uses chains (linked lists) for the hash buckets, but there are
+ * no real pointers involved.  Indexing `hash_tab' by hash value gives the index
+ * within the window of the last string in the hash bucket.  To find the index
+ * of the previous string in the hash chain, the `prev_tab' array is indexed by
+ * the string index.  `prev_tab' can be indexed repeatedly by the string index
+ * to walk through the hash chain, until the special index `0' is reached,
+ * indicating the end of the hash chain.
+ */
+static inline uint insert_string(u16 hash_tab[], u16 prev_tab[],
+                                const u8 window[], uint str_pos, uint hash)
+{
+       hash = update_hash(hash, window[str_pos + LZ_MIN_MATCH - 1]);
+       prev_tab[str_pos] = hash_tab[hash];
+       hash_tab[hash] = str_pos;
+       return hash;
+}
+
+
+/*
+ * Returns the longest match for a given input position.
+ *
+ * @window:            The window of uncompressed data.
+ * @bytes_remaining:   The number of bytes remaining in the window.
+ * @strstart:          The index of the start of the string in the window that
+ *                             we are trying to find a match for.
+ * @prev_tab:          The array of prev pointers for the hash table.
+ * @cur_match:         The index of the head of the hash chain for matches
+ *                             having the hash value of the string beginning
+ *                             at index @strstart.
+ * @prev_len:          The length of the match that was found for the string
+ *                             beginning at (@strstart - 1).
+ * @match_start_ret:   A location into which the index of the start of the
+ *                             match will be returned.
+ * @params:            Parameters that affect how long the search will proceed
+ *                             before going with the best that has been found
+ *                             so far.
+ *
+ * Returns the length of the match that was found.
+ */
+static uint longest_match(const u8 window[], uint bytes_remaining,
+                         uint strstart, const u16 prev_tab[],
+                         uint cur_match, uint prev_len,
+                         uint *match_start_ret,
+                         const struct lz_params *params)
+{
+       uint chain_len = params->max_chain_len;
+
+       const u8 *scan = window + strstart;
+       const u8 *match;
+       uint len;
+       uint best_len = prev_len;
+       uint match_start = cur_match;
+
+       uint nice_match = min(params->nice_match, bytes_remaining);
+
+       const u8 *strend = scan + min(params->max_match, bytes_remaining);
+
+       u8 scan_end1 = scan[best_len - 1];
+       u8 scan_end = scan[best_len];
+
+
+       /* Do not waste too much time if we already have a good match: */
+       if (best_len >= params->good_match)
+               chain_len >>= 2;
+
+       do {
+               match = &window[cur_match];
+
+               /* Skip to next match if the match length cannot increase or if
+                * the match length is less than 2.  Note that the checks below
+                * for insufficient lookahead only occur occasionally for
+                * performance reasons.  Therefore uninitialized memory will be
+                * accessed, and conditional jumps will be made that depend on
+                * those values.  However the length of the match is limited to
+                * the lookahead, so the output of deflate is not affected by
+                * the uninitialized values.
+                */
+
+               if (match[best_len] != scan_end
+                   || match[best_len - 1] != scan_end1
+                   || *match != *scan
+                   || *++match != scan[1])
+                       continue;
+               scan++;
+
+       #if 0
+               do {
+               } while (scan < strend && *++match == *++scan);
+       #else
+
+               do {
+               } while (
+                        *++match == *++scan && *++match == *++scan &&
+                        *++match == *++scan && *++match == *++scan &&
+                        *++match == *++scan && *++match == *++scan &&
+                        *++match == *++scan && *++match == *++scan &&
+                        scan < strend);
+       #endif
+               len = match - &window[cur_match];
+
+               scan = &window[strstart];
+
+               if (len > best_len) {
+                       match_start = cur_match;
+                       best_len = len;
+                       if (len >= nice_match)
+                               break;
+                       scan_end1  = scan[best_len - 1];
+                       scan_end   = scan[best_len];
+               }
+       } while (--chain_len != 0 && (cur_match = prev_tab[cur_match]) != 0);
+       *match_start_ret = match_start;
+       return min(min(best_len, bytes_remaining), params->max_match);
+}
+
+
+
+/*
+ * Determines the sequence of matches and literals that a block of data will be
+ * compressed to.
+ *
+ * @uncompressed_data: The data that is to be compressed.
+ * @uncompressed_len:  The length of @uncompressed_data, in bytes.
+ * @match_tab:         An array for the intermediate representation of matches.
+ * @record_match:      A function that will be called to produce the
+ *                             intermediate representation of a match, given
+ *                             the offset and length.  This function should also
+ *                             update the appropriate symbol frequency counts
+ *                             so that any needed Huffman codes can be made
+ *                             later.
+ * @record_literal:    A function that will be called to produce the
+ *                             intermediate representation of a literal, given
+ *                             the character of the literal.  This function
+ *                             should also update the appropriate symbol
+ *                             frequency counts so that any needed Huffman
+ *                             codes can be made later.
+ * @record_match_arg_1:
+ * @record_match_arg_2:        Extra arguments to be passed to @record_match.
+ * @record_literal_arg:        Extra arguments to be passed to @record_literal.
+ * @params:            Structure that contains parameters that affect how the
+ *                             analysis proceeds (mainly how good the matches
+ *                             have to be).
+ *
+ * Returns the total number of matches and literal bytes that were found; this
+ * is the number of slots in @match_tab that have been filled with the
+ * intermediate representation of a match or literal byte.
+ */
+uint lz_analyze_block(const u8 uncompressed_data[], uint uncompressed_len,
+                     u32 match_tab[], lz_record_match_t record_match,
+                     lz_record_literal_t record_literal, void *record_match_arg1,
+                     void *record_match_arg2, void *record_literal_arg,
+                     const struct lz_params *params)
+{
+       uint cur_match_pos = 0;
+       uint cur_input_pos = 0;
+       uint hash          = 0;
+       uint hash_head     = 0;
+       uint prev_len      = params->min_match - 1;
+       uint prev_start;
+       uint match_len     = params->min_match - 1;
+       uint match_start   = 0;
+       bool match_available = false;
+       u16 hash_tab[HASH_SIZE];
+       u32 match;
+       u16 prev_tab[uncompressed_len];
+
+       ZERO_ARRAY(hash_tab);
+       ZERO_ARRAY(prev_tab);
+
+       do {
+               /* If there are at least 3 characters remaining in the input,
+                * insert the 3-character string beginning at
+                * uncompressed_data[cur_input_pos] into the hash table.
+                *
+                * hash_head is set to the index of the previous string in the
+                * hash bucket, or 0 if there is no such string */
+               if (uncompressed_len - cur_input_pos >= params->min_match) {
+                       hash = insert_string(hash_tab, prev_tab,
+                                            uncompressed_data,
+                                            cur_input_pos, hash);
+                       hash_head = prev_tab[cur_input_pos];
+               } else {
+                       hash_head = 0;
+               }
+
+
+               /* Find the longest match, discarding those <= prev_len. */
+               prev_len = match_len;
+               prev_start = match_start;
+               match_len = params->min_match - 1;
+
+               if (hash_head != 0 && prev_len < params->max_lazy_match) {
+                       /* To simplify the code, we prevent matches with the
+                        * string of window index 0 (in particular we have to
+                        * avoid a match of the string with itself at the start
+                        * of the input file).  */
+                       match_len = longest_match(uncompressed_data,
+                                                 uncompressed_len - cur_input_pos,
+                                                 cur_input_pos, prev_tab,
+                                                 hash_head, prev_len,
+                                                 &match_start, params);
+
+                       if (match_len == params->min_match &&
+                            cur_input_pos - match_start > params->too_far)
+                               match_len = params->min_match - 1;
+               }
+
+               /* If there was a match at the previous step and the current
+                * match is not better, output the previous match:
+                */
+               if (prev_len >= params->min_match && match_len <= prev_len) {
+
+                       /* Do not insert strings in hash table beyond this. */
+                       uint max_insert = uncompressed_len - params->min_match;
+
+                       /*DEBUG("Recording match (pos = %u, offset = %u, len = %u)\n",*/
+                                       /*cur_input_pos - 1, */
+                                       /*cur_input_pos - 1 - prev_start,*/
+                                       /*prev_len);*/
+
+                       match = (*record_match)(cur_input_pos - 1 - prev_start,
+                                               prev_len,
+                                               record_match_arg1,
+                                               record_match_arg2);
+
+                       match_tab[cur_match_pos++] = match;
+
+                       /* Insert in hash table all strings up to the end of the match.
+                        * strstart-1 and strstart are already inserted. If there is not
+                        * enough lookahead, the last two strings are not inserted in
+                        * the hash table.
+                        */
+#if LZ_MIN_MATCH == 2
+                       if (prev_len >= 3)
+#endif
+                       {
+                               prev_len -= 2;
+
+                               do {
+                                       if (++cur_input_pos <= max_insert) {
+                                               hash = insert_string(hash_tab, prev_tab,
+                                                                    uncompressed_data,
+                                                                    cur_input_pos,
+                                                                    hash);
+                                       }
+                               } while (--prev_len != 0);
+                       }
+                       match_available = false;
+                       match_len = params->min_match - 1;
+               } else if (match_available) {
+                       /* If there was no match at the previous position, output a
+                        * single literal. If there was a match but the current match
+                        * is longer, truncate the previous match to a single literal.
+                        */
+
+                       /*DEBUG("Recording litrl (pos = %u, value = %u)\n",*/
+                                       /*cur_input_pos - 1, */
+                                       /*uncompressed_data[cur_input_pos - 1]);*/
+
+                       match = (*record_literal)(
+                                       uncompressed_data[cur_input_pos - 1],
+                                                       record_literal_arg);
+                       match_tab[cur_match_pos++] = match;
+               } else {
+                       /* There is no previous match to compare with, wait for
+                        * the next step to decide.  */
+                       match_available = true;
+               }
+       } while (++cur_input_pos < uncompressed_len);
+
+       if (match_available) {
+               match = (*record_literal)(uncompressed_data[cur_input_pos - 1],
+                                               record_literal_arg);
+               match_tab[cur_match_pos++] = match;
+       }
+       return cur_match_pos;
+}