Faster XPRESS compression

author Eric Biggers <ebiggers3@gmail.com>

Tue, 23 Dec 2014 05:16:38 +0000 (23:16 -0600)

committer Eric Biggers <ebiggers3@gmail.com>

Tue, 23 Dec 2014 05:36:49 +0000 (23:36 -0600)
author Eric Biggers <ebiggers3@gmail.com>
Tue, 23 Dec 2014 05:16:38 +0000 (23:16 -0600)
committer Eric Biggers <ebiggers3@gmail.com>
Tue, 23 Dec 2014 05:36:49 +0000 (23:36 -0600)
diff --git a/Makefile.am b/Makefile.am

index 82cc5f61fd6cf31b1fac7099f98bd2605cd260b7..39b86f22ed0390c75888465cacfae15adaaf500c 100644 (file)
--- a/Makefile.am
+++ b/Makefile.am
@@ -98,6 +98,7 @@ libwim_la_SOURCES =           \
         include/wimlib/assert.h         \
         include/wimlib/avl_tree.h       \
         include/wimlib/bitops.h         \
         include/wimlib/assert.h         \
         include/wimlib/avl_tree.h       \
         include/wimlib/bitops.h         \
+       include/wimlib/bt_matchfinder.h \
         include/wimlib/callback.h       \
         include/wimlib/capture.h        \
         include/wimlib/case.h           \
         include/wimlib/callback.h       \
         include/wimlib/capture.h        \
         include/wimlib/case.h           \
@@ -115,6 +116,7 @@ libwim_la_SOURCES =         \
         include/wimlib/error.h          \
         include/wimlib/file_io.h        \
         include/wimlib/glob.h           \
         include/wimlib/error.h          \
         include/wimlib/file_io.h        \
         include/wimlib/glob.h           \
+       include/wimlib/hc_matchfinder.h \
         include/wimlib/header.h         \
         include/wimlib/inode.h          \
         include/wimlib/inode_table.h    \
         include/wimlib/header.h         \
         include/wimlib/inode.h          \
         include/wimlib/inode_table.h    \
@@ -131,6 +133,11 @@ libwim_la_SOURCES =                \
         include/wimlib/lzms_constants.h \
         include/wimlib/lzx.h            \
         include/wimlib/lzx_constants.h  \
         include/wimlib/lzms_constants.h \
         include/wimlib/lzx.h            \
         include/wimlib/lzx_constants.h  \
+       include/wimlib/matchfinder_avx2.h       \
+       include/wimlib/matchfinder_common.h     \
+       include/wimlib/matchfinder_nonsliding.h \
+       include/wimlib/matchfinder_sliding.h    \
+       include/wimlib/matchfinder_sse2.h       \
         include/wimlib/metadata.h       \
         include/wimlib/pathlist.h       \
         include/wimlib/paths.h          \
         include/wimlib/metadata.h       \
         include/wimlib/pathlist.h       \
         include/wimlib/paths.h          \
diff --git a/NEWS b/NEWS

index 5f6394bc61248b90b9f9ba8ecf14c60a2092f8f8..ff832a57305945dead573ad07ee69cbf0ce4c51a 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -1,14 +1,15 @@
  Version 1.7.4-BETA:
  Version 1.7.4-BETA:
-       Extracting files to a Windows PE in-memory filesystem no longer fails if
-       the target files do not yet exist.  (Broken since v1.7.0, but actually a
-       result of a Microsoft bug.)
-
         The Windows binary distribution no longer contain third party DLLs.
         These dependencies are instead compiled directly into libwim.dll.
  
         The Windows binary distribution no longer contain third party DLLs.
         These dependencies are instead compiled directly into libwim.dll.
  
-       More fixes for wimlib on non-x86 architectures such as ARM.
+       Added more fixes for wimlib on non-x86 architectures such as ARM.
+
+       Extracting files to a Windows PE in-memory filesystem no longer fails if
+       the target files do not yet exist.
  
  
-       Slight performance improvements in compression and decompression.
+       Made more improvements to compression and decompression.  Most notable
+       is about 10% faster XPRESS compression with a tiny improvement in
+       compression ratio.
  
         Removed the --with-imagex-progname and --enable-more-assertions
         configure options.
  
         Removed the --with-imagex-progname and --enable-more-assertions
         configure options.
diff --git a/include/wimlib/bt_matchfinder.h b/include/wimlib/bt_matchfinder.h

new file mode 100644 (file)

index 0000000..75858a3
--- /dev/null
+++ b/include/wimlib/bt_matchfinder.h
@@ -0,0 +1,228 @@
+/*
+ * bt_matchfinder.h
+ *
+ * Copyright (c) 2014 Eric Biggers.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#include "wimlib/lz_extend.h"
+#include "wimlib/lz_hash3.h"
+#include "wimlib/matchfinder_common.h"
+#include "wimlib/unaligned.h"
+
+#ifndef BT_MATCHFINDER_HASH_ORDER
+#  if MATCHFINDER_WINDOW_ORDER < 14
+#    define BT_MATCHFINDER_HASH_ORDER 14
+#  else
+#    define BT_MATCHFINDER_HASH_ORDER 15
+#  endif
+#endif
+
+#define BT_MATCHFINDER_HASH_LENGTH     (1UL << BT_MATCHFINDER_HASH_ORDER)
+
+#define BT_MATCHFINDER_TOTAL_LENGTH    \
+       (BT_MATCHFINDER_HASH_LENGTH + (2UL * MATCHFINDER_WINDOW_SIZE))
+
+struct bt_matchfinder {
+       union {
+               pos_t mf_data[BT_MATCHFINDER_TOTAL_LENGTH];
+               struct {
+                       pos_t hash_tab[BT_MATCHFINDER_HASH_LENGTH];
+                       pos_t child_tab[2UL * MATCHFINDER_WINDOW_SIZE];
+               };
+       };
+} _aligned_attribute(MATCHFINDER_ALIGNMENT);
+
+static inline void
+bt_matchfinder_init(struct bt_matchfinder *mf)
+{
+       matchfinder_init(mf->hash_tab, BT_MATCHFINDER_HASH_LENGTH);
+}
+
+#if MATCHFINDER_IS_SLIDING
+static inline void
+bt_matchfinder_slide_window(struct bt_matchfinder *mf)
+{
+       matchfinder_rebase(mf->mf_data, BT_MATCHFINDER_TOTAL_LENGTH);
+}
+#endif
+
+static inline unsigned
+bt_matchfinder_get_matches(struct bt_matchfinder * const restrict mf,
+                          const u8 * const in_base,
+                          const u8 * const in_next,
+                          const unsigned min_len,
+                          const unsigned max_len,
+                          const unsigned nice_len,
+                          const unsigned max_search_depth,
+                          unsigned long *prev_hash,
+                          struct lz_match * const restrict matches)
+{
+       struct lz_match *lz_matchptr = matches;
+       unsigned depth_remaining = max_search_depth;
+       unsigned hash;
+       pos_t cur_match;
+       const u8 *matchptr;
+       unsigned best_len;
+       pos_t *pending_lt_ptr, *pending_gt_ptr;
+       unsigned best_lt_len, best_gt_len;
+       unsigned len;
+       pos_t *children;
+
+       if (unlikely(max_len < LZ_HASH_REQUIRED_NBYTES + 1))
+               return 0;
+
+       hash = *prev_hash;
+       *prev_hash = lz_hash(in_next + 1, BT_MATCHFINDER_HASH_ORDER);
+       prefetch(&mf->hash_tab[*prev_hash]);
+       cur_match = mf->hash_tab[hash];
+       mf->hash_tab[hash] = in_next - in_base;
+
+       best_len = min_len - 1;
+       pending_lt_ptr = &mf->child_tab[(in_next - in_base) << 1];
+       pending_gt_ptr = &mf->child_tab[((in_next - in_base) << 1) + 1];
+       best_lt_len = 0;
+       best_gt_len = 0;
+       for (;;) {
+               if (!matchfinder_match_in_window(cur_match,
+                                                in_base, in_next) ||
+                   !depth_remaining--)
+               {
+                       *pending_lt_ptr = MATCHFINDER_INITVAL;
+                       *pending_gt_ptr = MATCHFINDER_INITVAL;
+                       return lz_matchptr - matches;
+               }
+
+               matchptr = &in_base[cur_match];
+               len = min(best_lt_len, best_gt_len);
+
+               children = &mf->child_tab[(unsigned long)
+                               matchfinder_slot_for_match(cur_match) << 1];
+
+               if (matchptr[len] == in_next[len]) {
+
+                       len = lz_extend(in_next, matchptr, len + 1, max_len);
+
+                       if (len > best_len) {
+                               best_len = len;
+
+                               lz_matchptr->length = len;
+                               lz_matchptr->offset = in_next - matchptr;
+                               lz_matchptr++;
+
+                               if (len >= nice_len) {
+                                       *pending_lt_ptr = children[0];
+                                       *pending_gt_ptr = children[1];
+                                       return lz_matchptr - matches;
+                               }
+                       }
+               }
+
+               if (matchptr[len] < in_next[len]) {
+                       *pending_lt_ptr = cur_match;
+                       pending_lt_ptr = &children[1];
+                       cur_match = *pending_lt_ptr;
+                       best_lt_len = len;
+               } else {
+                       *pending_gt_ptr = cur_match;
+                       pending_gt_ptr = &children[0];
+                       cur_match = *pending_gt_ptr;
+                       best_gt_len = len;
+               }
+       }
+}
+
+static inline void
+bt_matchfinder_skip_position(struct bt_matchfinder * const restrict mf,
+                            const u8 * const in_base,
+                            const u8 * const in_next,
+                            const u8 * const in_end,
+                            const unsigned nice_len,
+                            const unsigned max_search_depth,
+                            unsigned long *prev_hash)
+{
+       unsigned depth_remaining = max_search_depth;
+       unsigned hash;
+       pos_t cur_match;
+       const u8 *matchptr;
+       pos_t *pending_lt_ptr, *pending_gt_ptr;
+       unsigned best_lt_len, best_gt_len;
+       unsigned len;
+       pos_t *children;
+
+       if (unlikely(in_end - in_next < LZ_HASH_REQUIRED_NBYTES + 1))
+               return;
+
+       hash = *prev_hash;
+       *prev_hash = lz_hash(in_next + 1, BT_MATCHFINDER_HASH_ORDER);
+       prefetch(&mf->hash_tab[*prev_hash]);
+       cur_match = mf->hash_tab[hash];
+       mf->hash_tab[hash] = in_next - in_base;
+
+       depth_remaining = max_search_depth;
+       pending_lt_ptr = &mf->child_tab[(in_next - in_base) << 1];
+       pending_gt_ptr = &mf->child_tab[((in_next - in_base) << 1) + 1];
+       best_lt_len = 0;
+       best_gt_len = 0;
+       for (;;) {
+               if (!matchfinder_match_in_window(cur_match,
+                                                in_base, in_next) ||
+                   !depth_remaining--)
+               {
+                       *pending_lt_ptr = MATCHFINDER_INITVAL;
+                       *pending_gt_ptr = MATCHFINDER_INITVAL;
+                       return;
+               }
+
+               matchptr = &in_base[cur_match];
+               len = min(best_lt_len, best_gt_len);
+
+               children = &mf->child_tab[(unsigned long)
+                               matchfinder_slot_for_match(cur_match) << 1];
+
+               if (matchptr[len] == in_next[len]) {
+                       len = lz_extend(in_next, matchptr, len + 1, nice_len);
+                       if (len == nice_len) {
+                               *pending_lt_ptr = children[0];
+                               *pending_gt_ptr = children[1];
+                               return;
+                       }
+               }
+
+               if (matchptr[len] < in_next[len]) {
+                       *pending_lt_ptr = cur_match;
+                       pending_lt_ptr = &children[1];
+                       cur_match = *pending_lt_ptr;
+                       best_lt_len = len;
+               } else {
+                       *pending_gt_ptr = cur_match;
+                       pending_gt_ptr = &children[0];
+                       cur_match = *pending_gt_ptr;
+                       best_gt_len = len;
+               }
+       }
+}
diff --git a/include/wimlib/hc_matchfinder.h b/include/wimlib/hc_matchfinder.h

new file mode 100644 (file)

index 0000000..d880b24
--- /dev/null
+++ b/include/wimlib/hc_matchfinder.h
@@ -0,0 +1,247 @@
+/*
+ * hc_matchfinder.h
+ *
+ * Copyright (c) 2014 Eric Biggers.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#include "wimlib/lz_extend.h"
+#include "wimlib/lz_hash3.h"
+#include "wimlib/matchfinder_common.h"
+#include "wimlib/unaligned.h"
+
+#ifndef HC_MATCHFINDER_HASH_ORDER
+#  if MATCHFINDER_WINDOW_ORDER < 14
+#    define HC_MATCHFINDER_HASH_ORDER 14
+#  else
+#    define HC_MATCHFINDER_HASH_ORDER 15
+#  endif
+#endif
+
+#define HC_MATCHFINDER_HASH_LENGTH     (1UL << HC_MATCHFINDER_HASH_ORDER)
+
+#define HC_MATCHFINDER_TOTAL_LENGTH    \
+       (HC_MATCHFINDER_HASH_LENGTH + MATCHFINDER_WINDOW_SIZE)
+
+struct hc_matchfinder {
+       union {
+               pos_t mf_data[HC_MATCHFINDER_TOTAL_LENGTH];
+               struct {
+                       pos_t hash_tab[HC_MATCHFINDER_HASH_LENGTH];
+                       pos_t child_tab[MATCHFINDER_WINDOW_SIZE];
+               };
+       };
+} _aligned_attribute(MATCHFINDER_ALIGNMENT);
+
+/*
+ * Call before running the first byte through the matchfinder.
+ */
+static inline void
+hc_matchfinder_init(struct hc_matchfinder *mf)
+{
+       matchfinder_init(mf->hash_tab, HC_MATCHFINDER_HASH_LENGTH);
+}
+
+#if MATCHFINDER_IS_SLIDING
+static inline void
+hc_matchfinder_slide_window(struct hc_matchfinder *mf)
+{
+       matchfinder_rebase(mf->mf_data, HC_MATCHFINDER_TOTAL_LENGTH);
+}
+#endif
+
+/*
+ * Find the longest match longer than 'best_len'.
+ *
+ * @mf
+ *     The matchfinder structure.
+ * @in_base
+ *     Pointer to the next byte in the input buffer to process _at the last
+ *     time hc_matchfinder_init() or hc_matchfinder_slide_window() was called_.
+ * @in_next
+ *     Pointer to the next byte in the input buffer to process.  This is the
+ *     pointer to the bytes being matched against.
+ * @best_len
+ *     Require a match at least this long.
+ * @max_len
+ *     Maximum match length to return.
+ * @nice_len
+ *     Stop searching if a match of at least this length is found.
+ * @max_search_depth
+ *     Limit on the number of potential matches to consider.
+ * @offset_ret
+ *     The match offset is returned here.
+ *
+ * Return the length of the match found, or 'best_len' if no match longer than
+ * 'best_len' was found.
+ */
+static inline unsigned
+hc_matchfinder_longest_match(struct hc_matchfinder * const restrict mf,
+                            const u8 * const in_base,
+                            const u8 * const in_next,
+                            unsigned best_len,
+                            const unsigned max_len,
+                            const unsigned nice_len,
+                            const unsigned max_search_depth,
+                            unsigned *offset_ret)
+{
+       unsigned depth_remaining = max_search_depth;
+       const u8 *best_matchptr = best_matchptr; /* uninitialized */
+       const u8 *matchptr;
+       unsigned len;
+       unsigned hash;
+       pos_t cur_match;
+       u32 first_3_bytes;
+
+       /* Insert the current sequence into the appropriate hash chain.  */
+       if (unlikely(max_len < LZ_HASH_REQUIRED_NBYTES))
+               goto out;
+       first_3_bytes = load_u24_unaligned(in_next);
+       hash = lz_hash_u24(first_3_bytes, HC_MATCHFINDER_HASH_ORDER);
+       cur_match = mf->hash_tab[hash];
+       mf->child_tab[in_next - in_base] = cur_match;
+       mf->hash_tab[hash] = in_next - in_base;
+
+       if (unlikely(best_len >= max_len))
+               goto out;
+
+       /* Search the appropriate hash chain for matches.  */
+
+       if (!(matchfinder_match_in_window(cur_match, in_base, in_next)))
+               goto out;
+
+       if (best_len < 3) {
+               for (;;) {
+                       /* No length 3 match found yet.
+                        * Check the first 3 bytes.  */
+                       matchptr = &in_base[cur_match];
+
+                       if (load_u24_unaligned(matchptr) == first_3_bytes)
+                               break;
+
+                       /* Not a match; keep trying.  */
+                       cur_match = mf->child_tab[
+                                       matchfinder_slot_for_match(cur_match)];
+                       if (!matchfinder_match_in_window(cur_match,
+                                                        in_base, in_next))
+                               goto out;
+                       if (!--depth_remaining)
+                               goto out;
+               }
+
+               /* Found a length 3 match.  */
+               best_matchptr = matchptr;
+               best_len = lz_extend(in_next, best_matchptr, 3, max_len);
+               if (best_len >= nice_len)
+                       goto out;
+               cur_match = mf->child_tab[matchfinder_slot_for_match(cur_match)];
+               if (!matchfinder_match_in_window(cur_match, in_base, in_next))
+                       goto out;
+               if (!--depth_remaining)
+                       goto out;
+       }
+
+       for (;;) {
+               for (;;) {
+                       matchptr = &in_base[cur_match];
+
+                       /* Already found a length 3 match.  Try for a longer match;
+                        * start by checking the last 2 bytes and the first 4 bytes.  */
+               #if UNALIGNED_ACCESS_IS_FAST
+                       if ((load_u32_unaligned(matchptr + best_len - 3) ==
+                            load_u32_unaligned(in_next + best_len - 3)) &&
+                           (load_u32_unaligned(matchptr) ==
+                            load_u32_unaligned(in_next)))
+               #else
+                       if (matchptr[best_len] == in_next[best_len])
+               #endif
+                               break;
+
+                       cur_match = mf->child_tab[matchfinder_slot_for_match(cur_match)];
+                       if (!matchfinder_match_in_window(cur_match, in_base, in_next))
+                               goto out;
+                       if (!--depth_remaining)
+                               goto out;
+               }
+
+       #if UNALIGNED_ACCESS_IS_FAST
+               len = 4;
+       #else
+               len = 0;
+       #endif
+               len = lz_extend(in_next, matchptr, len, max_len);
+               if (len > best_len) {
+                       best_len = len;
+                       best_matchptr = matchptr;
+                       if (best_len >= nice_len)
+                               goto out;
+               }
+               cur_match = mf->child_tab[matchfinder_slot_for_match(cur_match)];
+               if (!matchfinder_match_in_window(cur_match, in_base, in_next))
+                       goto out;
+               if (!--depth_remaining)
+                       goto out;
+       }
+out:
+       *offset_ret = in_next - best_matchptr;
+       return best_len;
+}
+
+/*
+ * Advance the match-finder, but don't search for matches.
+ *
+ * @mf
+ *     The matchfinder structure.
+ * @in_base
+ *     Pointer to the next byte in the input buffer to process _at the last
+ *     time hc_matchfinder_init() or hc_matchfinder_slide_window() was called_.
+ * @in_next
+ *     Pointer to the next byte in the input buffer to process.
+ * @in_end
+ *     Pointer to the end of the input buffer.
+ * @count
+ *     Number of bytes to skip; must be > 0.
+ */
+static inline void
+hc_matchfinder_skip_positions(struct hc_matchfinder * restrict mf,
+                             const u8 *in_base,
+                             const u8 *in_next,
+                             const u8 *in_end,
+                             unsigned count)
+{
+       unsigned hash;
+
+       if (unlikely(in_next + count >= in_end - LZ_HASH_REQUIRED_NBYTES))
+               return;
+
+       do {
+               hash = lz_hash(in_next, HC_MATCHFINDER_HASH_ORDER);
+               mf->child_tab[in_next - in_base] = mf->hash_tab[hash];
+               mf->hash_tab[hash] = in_next - in_base;
+               in_next++;
+       } while (--count);
+}
diff --git a/include/wimlib/lz_hash3.h b/include/wimlib/lz_hash3.h

index b204984b1c62f92b659639164b8357e754082792..13e56db786eae174b9cfbabd5fad82964f0700d5 100644 (file)
--- a/include/wimlib/lz_hash3.h
+++ b/include/wimlib/lz_hash3.h
@@ -56,6 +56,6 @@ lz_hash(const u8 *p, unsigned num_bits)
  
  /* Number of bytes the hash function actually requires be available, due to the
   * possibility of an unaligned load.  */
  
  /* Number of bytes the hash function actually requires be available, due to the
   * possibility of an unaligned load.  */
-#define LZ_HASH_REQUIRED_NBYTES 4
+#define LZ_HASH_REQUIRED_NBYTES (UNALIGNED_ACCESS_IS_FAST ? 4 : 3)
  
  #endif /* _WIMLIB_LZ_HASH3_H */
  
  #endif /* _WIMLIB_LZ_HASH3_H */
diff --git a/include/wimlib/matchfinder_avx2.h b/include/wimlib/matchfinder_avx2.h

new file mode 100644 (file)

index 0000000..fe98b63
--- /dev/null
+++ b/include/wimlib/matchfinder_avx2.h
@@ -0,0 +1,64 @@
+/*
+ * matchfinder_avx2.h
+ *
+ * Matchfinding routines optimized for Intel AVX2 (Advanced Vector Extensions).
+ */
+
+#include <immintrin.h>
+
+static inline bool
+matchfinder_init_avx2(pos_t *data, size_t size)
+{
+       __m256i v, *p;
+       size_t n;
+
+       if (size % sizeof(__m256i) * 4)
+               return false;
+
+       if (sizeof(pos_t) == 2)
+               v = _mm256_set1_epi16(MATCHFINDER_INITVAL);
+       else if (sizeof(pos_t) == 4)
+               v = _mm256_set1_epi32(MATCHFINDER_INITVAL);
+       else
+               return false;
+
+       p = (__m256i *)data;
+       n = size / (sizeof(__m256i) * 4);
+       do {
+               p[0] = v;
+               p[1] = v;
+               p[2] = v;
+               p[3] = v;
+               p += 4;
+       } while (--n);
+       return true;
+}
+
+static inline bool
+matchfinder_rebase_avx2(pos_t *data, size_t size)
+{
+       __m256i v, *p;
+       size_t n;
+
+       if ((size % sizeof(__m256i) * 4 != 0))
+               return false;
+
+       if (sizeof(pos_t) == 2)
+               v = _mm256_set1_epi16((pos_t)-MATCHFINDER_WINDOW_SIZE);
+       else if (sizeof(pos_t) == 4)
+               v = _mm256_set1_epi32((pos_t)-MATCHFINDER_WINDOW_SIZE);
+       else
+               return false;
+
+       p = (__m256i *)data;
+       n = size / (sizeof(__m256i) * 4);
+       do {
+               /* PADDSW: Add Packed Signed Integers With Signed Saturation  */
+               p[0] = _mm256_adds_epi16(p[0], v);
+               p[1] = _mm256_adds_epi16(p[1], v);
+               p[2] = _mm256_adds_epi16(p[2], v);
+               p[3] = _mm256_adds_epi16(p[3], v);
+               p += 4;
+       } while (--n);
+       return true;
+}
diff --git a/include/wimlib/matchfinder_common.h b/include/wimlib/matchfinder_common.h

new file mode 100644 (file)

index 0000000..afaab9c
--- /dev/null
+++ b/include/wimlib/matchfinder_common.h
@@ -0,0 +1,188 @@
+/*
+ * matchfinder_common.h
+ *
+ * Common code for Lempel-Ziv matchfinding.
+ *
+ * Copyright (c) 2014 Eric Biggers.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#include "wimlib/types.h"
+
+#include <string.h>
+
+#ifndef MATCHFINDER_WINDOW_ORDER
+#  error "MATCHFINDER_WINDOW_ORDER must be defined!"
+#endif
+
+#ifndef MATCHFINDER_IS_SLIDING
+#  error "MATCHFINDER_IS_SLIDING must be defined!"
+#endif
+
+#define MATCHFINDER_WINDOW_SIZE ((size_t)1 << MATCHFINDER_WINDOW_ORDER)
+
+#if MATCHFINDER_IS_SLIDING
+#  include "matchfinder_sliding.h"
+#else
+#  include "matchfinder_nonsliding.h"
+#endif
+
+#define MATCHFINDER_ALIGNMENT 8
+
+#ifdef __AVX2__
+#  include "matchfinder_avx2.h"
+#  if MATCHFINDER_ALIGNMENT < 32
+#    undef MATCHFINDER_ALIGNMENT
+#    define MATCHFINDER_ALIGNMENT 32
+#  endif
+#endif
+
+#ifdef __SSE2__
+#  include "matchfinder_sse2.h"
+#  if MATCHFINDER_ALIGNMENT < 16
+#    undef MATCHFINDER_ALIGNMENT
+#    define MATCHFINDER_ALIGNMENT 16
+#  endif
+#endif
+
+/*
+ * Representation of a match.
+ */
+struct lz_match {
+
+       /* The number of bytes matched.  */
+       pos_t length;
+
+       /* The offset back from the current position that was matched.  */
+       pos_t offset;
+};
+
+static inline bool
+matchfinder_memset_init_okay(void)
+{
+       /* All bytes must match in order to use memset.  */
+       const pos_t v = MATCHFINDER_INITVAL;
+       if (sizeof(pos_t) == 2)
+               return (u8)v == (u8)(v >> 8);
+       if (sizeof(pos_t) == 4)
+               return (u8)v == (u8)(v >> 8) &&
+                      (u8)v == (u8)(v >> 16) &&
+                      (u8)v == (u8)(v >> 24);
+       return false;
+}
+
+/*
+ * Initialize the hash table portion of the matchfinder.
+ *
+ * Essentially, this is an optimized memset().
+ *
+ * 'data' must be aligned to a MATCHFINDER_ALIGNMENT boundary.
+ */
+static inline void
+matchfinder_init(pos_t *data, size_t num_entries)
+{
+       const size_t size = num_entries * sizeof(data[0]);
+
+#ifdef __AVX2__
+       if (matchfinder_init_avx2(data, size))
+               return;
+#endif
+
+#ifdef __SSE2__
+       if (matchfinder_init_sse2(data, size))
+               return;
+#endif
+
+       if (matchfinder_memset_init_okay()) {
+               memset(data, (u8)MATCHFINDER_INITVAL, size);
+               return;
+       }
+
+       for (size_t i = 0; i < num_entries; i++)
+               data[i] = MATCHFINDER_INITVAL;
+}
+
+#if MATCHFINDER_IS_SLIDING
+/*
+ * Slide the matchfinder by WINDOW_SIZE bytes.
+ *
+ * This must be called just after each WINDOW_SIZE bytes have been run through
+ * the matchfinder.
+ *
+ * This will subtract WINDOW_SIZE bytes from each entry in the array specified.
+ * The effect is that all entries are updated to be relative to the current
+ * position, rather than the position WINDOW_SIZE bytes prior.
+ *
+ * Underflow is detected and replaced with signed saturation.  This ensures that
+ * once the sliding window has passed over a position, that position forever
+ * remains out of bounds.
+ *
+ * The array passed in must contain all matchfinder data that is
+ * position-relative.  Concretely, this will include the hash table as well as
+ * the table of positions that is used to link together the sequences in each
+ * hash bucket.  Note that in the latter table, the links are 1-ary in the case
+ * of "hash chains", and 2-ary in the case of "binary trees".  In either case,
+ * the links need to be rebased in the same way.
+ */
+static inline void
+matchfinder_rebase(pos_t *data, size_t num_entries)
+{
+       const size_t size = num_entries * sizeof(data[0]);
+
+#ifdef __AVX2__
+       if (matchfinder_rebase_avx2(data, size))
+               return;
+#endif
+
+#ifdef __SSE2__
+       if (matchfinder_rebase_sse2(data, size))
+               return;
+#endif
+
+       if (MATCHFINDER_WINDOW_SIZE == 32768) {
+               /* Branchless version for 32768 byte windows.  If the value was
+                * already negative, clear all bits except the sign bit; this
+                * changes the value to -32768.  Otherwise, set the sign bit;
+                * this is equivalent to subtracting 32768.  */
+               for (size_t i = 0; i < num_entries; i++) {
+                       u16 v = data[i];
+                       u16 sign_bit = v & 0x8000;
+                       v &= sign_bit - ((sign_bit >> 15) ^ 1);
+                       v |= 0x8000;
+                       data[i] = v;
+               }
+               return;
+       }
+
+       for (size_t i = 0; i < num_entries; i++) {
+               if (data[i] >= 0)
+                       data[i] -= (pos_t)-MATCHFINDER_WINDOW_SIZE;
+               else
+                       data[i] = (pos_t)-MATCHFINDER_WINDOW_SIZE;
+       }
+}
+#endif /* MATCHFINDER_IS_SLIDING */
diff --git a/include/wimlib/matchfinder_nonsliding.h b/include/wimlib/matchfinder_nonsliding.h

new file mode 100644 (file)

index 0000000..e08f461
--- /dev/null
+++ b/include/wimlib/matchfinder_nonsliding.h
@@ -0,0 +1,47 @@
+/*
+ * matchfinder_nonsliding.h
+ *
+ * Definitions for nonsliding window matchfinders.
+ *
+ * "Nonsliding window" means that any prior sequence can be matched.
+ */
+
+#if MATCHFINDER_WINDOW_ORDER <= 16
+typedef u16 pos_t;
+#else
+typedef u32 pos_t;
+#endif
+
+#if MATCHFINDER_WINDOW_ORDER != 16 && MATCHFINDER_WINDOW_ORDER != 32
+
+/* Not all the bits of the position type are needed, so the sign bit can be
+ * reserved to mean "out of bounds".  */
+#define MATCHFINDER_INITVAL ((pos_t)-1)
+
+static inline bool
+matchfinder_match_in_window(pos_t cur_match, const u8 *in_base, const u8 *in_next)
+{
+       return !(cur_match & ((pos_t)1 << (sizeof(pos_t) * 8 - 1)));
+}
+
+#else
+
+/* All bits of the position type are needed, so use 0 to mean "out of bounds".
+ * This prevents the beginning of the buffer from matching anything; however,
+ * this doesn't matter much.  */
+
+#define MATCHFINDER_INITVAL ((pos_t)0)
+
+static inline bool
+matchfinder_match_in_window(pos_t cur_match, const u8 *in_base, const u8 *in_next)
+{
+       return cur_match != 0;
+}
+
+#endif
+
+static inline pos_t
+matchfinder_slot_for_match(pos_t cur_match)
+{
+       return cur_match;
+}
diff --git a/include/wimlib/matchfinder_sliding.h b/include/wimlib/matchfinder_sliding.h

new file mode 100644 (file)

index 0000000..4b8a515
--- /dev/null
+++ b/include/wimlib/matchfinder_sliding.h
@@ -0,0 +1,30 @@
+/*
+ * matchfinder_sliding.h
+ *
+ * Definitions for sliding window matchfinders.
+ *
+ * "Sliding window" means that only sequences beginning in the most recent
+ * MATCHFINDER_WINDOW_SIZE bytes can be matched.
+ */
+
+#if MATCHFINDER_WINDOW_ORDER <= 15
+typedef s16 pos_t;
+#else
+typedef s32 pos_t;
+#endif
+
+#define MATCHFINDER_INITVAL ((pos_t)-MATCHFINDER_WINDOW_SIZE)
+
+/* In the sliding window case, positions are stored relative to 'in_base'.  */
+
+static inline bool
+matchfinder_match_in_window(pos_t cur_match, const u8 *in_base, const u8 *in_next)
+{
+       return cur_match > (pos_t)((in_next - in_base) - MATCHFINDER_WINDOW_SIZE);
+}
+
+static inline pos_t
+matchfinder_slot_for_match(pos_t cur_match)
+{
+       return cur_match & (MATCHFINDER_WINDOW_SIZE - 1);
+}
diff --git a/include/wimlib/matchfinder_sse2.h b/include/wimlib/matchfinder_sse2.h

new file mode 100644 (file)

index 0000000..cc27600
--- /dev/null
+++ b/include/wimlib/matchfinder_sse2.h
@@ -0,0 +1,64 @@
+/*
+ * matchfinder_sse2.h
+ *
+ * Matchfinding routines optimized for Intel SSE2 (Streaming SIMD Extensions).
+ */
+
+#include <emmintrin.h>
+
+static inline bool
+matchfinder_init_sse2(pos_t *data, size_t size)
+{
+       __m128i v, *p;
+       size_t n;
+
+       if (size % sizeof(__m128i) * 4)
+               return false;
+
+       if (sizeof(pos_t) == 2)
+               v = _mm_set1_epi16(MATCHFINDER_INITVAL);
+       else if (sizeof(pos_t) == 4)
+               v = _mm_set1_epi32(MATCHFINDER_INITVAL);
+       else
+               return false;
+
+       p = (__m128i *)data;
+       n = size / (sizeof(__m128i) * 4);
+       do {
+               p[0] = v;
+               p[1] = v;
+               p[2] = v;
+               p[3] = v;
+               p += 4;
+       } while (--n);
+       return true;
+}
+
+static inline bool
+matchfinder_rebase_sse2(pos_t *data, size_t size)
+{
+       __m128i v, *p;
+       size_t n;
+
+       if ((size % sizeof(__m128i) * 4 != 0))
+               return false;
+
+       if (sizeof(pos_t) == 2)
+               v = _mm_set1_epi16((pos_t)-MATCHFINDER_WINDOW_SIZE);
+       else if (sizeof(pos_t) == 4)
+               v = _mm_set1_epi32((pos_t)-MATCHFINDER_WINDOW_SIZE);
+       else
+               return false;
+
+       p = (__m128i *)data;
+       n = size / (sizeof(__m128i) * 4);
+       do {
+               /* PADDSW: Add Packed Signed Integers With Signed Saturation  */
+               p[0] = _mm_adds_epi16(p[0], v);
+               p[1] = _mm_adds_epi16(p[1], v);
+               p[2] = _mm_adds_epi16(p[2], v);
+               p[3] = _mm_adds_epi16(p[3], v);
+               p += 4;
+       } while (--n);
+       return true;
+}
diff --git a/src/xpress-compress.c b/src/xpress-compress.c

index fdcc2fca49e61a10b935e4ed3decf65479da9514..73d831ee98e55bd582e6d36673539416b9951c4d 100644 (file)
--- a/src/xpress-compress.c
+++ b/src/xpress-compress.c
@@ -1,8 +1,7 @@
  /*
   * xpress-compress.c
   *
  /*
   * xpress-compress.c
   *
- * A compressor that produces output compatible with the XPRESS (Huffman
- * version) compression format.
+ * A compressor for the XPRESS compression format (Huffman variant).
   */
  
  /*
   */
  
  /*
@@ -26,123 +25,181 @@
  #  include "config.h"
  #endif
  
  #  include "config.h"
  #endif
  
+/*
+ * The maximum buffer size, in bytes, that can be compressed.  An XPRESS
+ * compressor instance must be created with a 'max_bufsize' less than or equal
+ * to this value.
+ */
+#define XPRESS_MAX_BUFSIZE             65536
+
+/*
+ * Define to 1 to enable the near-optimal parsing algorithm at high compression
+ * levels.  The near-optimal parsing algorithm produces a compression ratio
+ * significantly better than the greedy and lazy algorithms.  However, it is
+ * much slower.
+ */
+#define SUPPORT_NEAR_OPTIMAL_PARSING   1
+
+/*
+ * The lowest compression level at which near-optimal parsing is enabled.
+ */
+#define MIN_LEVEL_FOR_NEAR_OPTIMAL     60
+
+/*
+ * The window order for the matchfinder.  This must be the base 2 logarithm of
+ * the maximum buffer size.
+ */
+#define MATCHFINDER_WINDOW_ORDER       16
+
+/*
+ * Although XPRESS can potentially use a sliding window, it isn't well suited
+ * for large buffers of data because there is no way to reset the Huffman code.
+ * Therefore, we only allow buffers in which there is no restriction on match
+ * offsets (no sliding window).  This simplifies the code and allows some
+ * optimizations.
+ */
+#define MATCHFINDER_IS_SLIDING         0
+
+#include <string.h>
+
  #include "wimlib/bitops.h"
  #include "wimlib/compress_common.h"
  #include "wimlib/compressor_ops.h"
  #include "wimlib/endianness.h"
  #include "wimlib/error.h"
  #include "wimlib/bitops.h"
  #include "wimlib/compress_common.h"
  #include "wimlib/compressor_ops.h"
  #include "wimlib/endianness.h"
  #include "wimlib/error.h"
-#include "wimlib/lz_mf.h"
+#include "wimlib/hc_matchfinder.h"
  #include "wimlib/unaligned.h"
  #include "wimlib/util.h"
  #include "wimlib/xpress.h"
  
  #include "wimlib/unaligned.h"
  #include "wimlib/util.h"
  #include "wimlib/xpress.h"
  
-#include <string.h>
-#include <limits.h>
-
-#define XPRESS_CACHE_PER_POS           8
-#define XPRESS_OPTIM_ARRAY_LENGTH      4096
+#if SUPPORT_NEAR_OPTIMAL_PARSING
  
  
-struct xpress_compressor;
-struct xpress_item;
-struct xpress_mc_pos_data;
+/*
+ * CACHE_RESERVE_PER_POS is the number of lz_match structures to reserve in the
+ * match cache for each byte position.  This value should be high enough so that
+ * virtually the time, all matches found in the input buffer can fit in the
+ * match cache.  However, fallback behavior on cache overflow is still required.
+ */
+#define CACHE_RESERVE_PER_POS  8
  
  
-/* Internal compression parameters  */
-struct xpress_compressor_params {
+/*
+ * We use a binary-tree based matchfinder for optimal parsing because it can
+ * find more matches in the same number of steps compared to hash-chain based
+ * matchfinders.  In addition, since we need to find matches at almost every
+ * position, there isn't much penalty for keeping the sequences sorted in the
+ * binary trees.
+ */
+#include "wimlib/bt_matchfinder.h"
  
  
-       /* See xpress_choose_items()  */
-       u32 (*choose_items_func)(struct xpress_compressor *);
+struct xpress_optimum_node;
  
  
-       /* For near-optimal parsing only  */
-       u32 num_optim_passes;
+#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
  
  
-       /* Match-finding algorithm and parameters  */
-       enum lz_mf_algo mf_algo;
-       u32 max_search_depth;
-       u32 nice_match_length;
-};
+struct xpress_item;
  
  
-/* State of the XPRESS compressor  */
+/* The main XPRESS compressor structure  */
  struct xpress_compressor {
  
  struct xpress_compressor {
  
-       /* Internal compression parameters  */
-       struct xpress_compressor_params params;
-
-       /* Data currently being compressed  */
-       const u8 *cur_window;
-       u32 cur_window_size;
-
-       /* Lempel-Ziv match-finder  */
-       struct lz_mf *mf;
-
-       /* Optimal parsing data  */
-       unsigned (*get_matches_func)(struct xpress_compressor *,
-                                    const struct lz_match **);
-       void (*skip_bytes_func)(struct xpress_compressor *, unsigned n);
-       struct lz_match *cached_matches;
-       struct lz_match *cache_ptr;
-       struct lz_match *cache_limit;
-       struct xpress_mc_pos_data *optimum;
-       u8 costs[XPRESS_NUM_SYMBOLS];
-
-       /* The selected sequence of matches/literals  */
-       struct xpress_item *chosen_items;
+       /* Pointer to the compress() implementation chosen at allocation time */
+       size_t (*impl)(struct xpress_compressor *,
+                      const void *, size_t, void *, size_t);
  
  
-       /* Symbol frequency counters  */
+       /* Symbol frequency counters for the Huffman code  */
         u32 freqs[XPRESS_NUM_SYMBOLS];
  
         u32 freqs[XPRESS_NUM_SYMBOLS];
  
-       /* The current Huffman code  */
+       /* The Huffman codewords and their lengths  */
         u32 codewords[XPRESS_NUM_SYMBOLS];
         u8 lens[XPRESS_NUM_SYMBOLS];
         u32 codewords[XPRESS_NUM_SYMBOLS];
         u8 lens[XPRESS_NUM_SYMBOLS];
-};
  
  
-/* Intermediate XPRESS match/literal format  */
-struct xpress_item {
+       /* The "nice" match length: if a match of this length is found, then
+        * choose it immediately without further consideration.  */
+       unsigned nice_match_length;
  
  
-       /* Bits 0  -  8: Symbol
-        * Bits 9  - 24: Length - XPRESS_MIN_MATCH_LEN
-        * Bits 25 - 28: Number of extra offset bits
-        * Bits 29+    : Extra offset bits  */
+       /* The maximum search depth: consider at most this many potential
+        * matches at each position.  */
+       unsigned max_search_depth;
  
  
-       u64 data;
+       union {
+               /* Data for greedy or lazy parsing  */
+               struct {
+                       struct hc_matchfinder hc_mf;
+                       struct xpress_item *chosen_items;
+                       u8 nonoptimal_end[0];
+               };
+
+       #if SUPPORT_NEAR_OPTIMAL_PARSING
+               /* Data for near-optimal parsing  */
+               struct {
+                       struct bt_matchfinder bt_mf;
+                       struct xpress_optimum_node *optimum_nodes;
+                       struct lz_match *match_cache;
+                       struct lz_match *cache_overflow_mark;
+                       unsigned num_optim_passes;
+                       u32 costs[XPRESS_NUM_SYMBOLS];
+                       u8 optimal_end[0];
+               };
+       #endif
+       };
  };
  
  };
  
+#if SUPPORT_NEAR_OPTIMAL_PARSING
+
  /*
  /*
- * Match chooser position data:
+ * This structure represents a byte position in the input buffer and a node in
+ * the graph of possible match/literal choices.
+ *
+ * Logically, each incoming edge to this node is labeled with a literal or a
+ * match that can be taken to reach this position from an earlier position; and
+ * each outgoing edge from this node is labeled with a literal or a match that
+ * can be taken to advance from this position to a later position.
+ *
+ * But these "edges" are actually stored elsewhere (in 'match_cache').  Here we
+ * associate with each node just two pieces of information:
+ *
+ *     'cost_to_end' is the minimum cost to reach the end of the buffer from
+ *     this position.
   *
   *
- * An array of these structures is used during the near-optimal match-choosing
- * algorithm.  They correspond to consecutive positions in the window and are
- * used to keep track of the cost to reach each position, and the match/literal
- * choices that need to be chosen to reach that position.
+ *     'item' represents the literal or match that must be chosen from here to
+ *     reach the end of the buffer with the minimum cost.  Equivalently, this
+ *     can be interpreted as the label of the outgoing edge on the minimum cost
+ *     path to the "end of buffer" node from this node.
   */
   */
-struct xpress_mc_pos_data {
+struct xpress_optimum_node {
  
  
-       /* The cost, in bits, of the lowest-cost path that has been found to
-        * reach this position.  This can change as progressively lower cost
-        * paths are found to reach this position.  */
-       u32 cost;
-#define MC_INFINITE_COST UINT32_MAX
+       u32 cost_to_end;
  
  
-       /* The match or literal that was taken to reach this position.  This can
-        * change as progressively lower cost paths are found to reach this
-        * position.
+       /*
+        * Notes on the match/literal representation used here:
          *
          *
-        * This variable is divided into two bitfields.
+        *      The low bits of 'item' are the length: 1 if the item is a
+        *      literal, or the match length if the item is a match.
          *
          *
-        * Literals:
-        *      Low bits are 1, high bits are the literal.
-        *
-        * Matches:
-        *      Low bits are the match length, high bits are the offset.
+        *      The high bits of 'item' are the actual literal byte if the item
+        *      is a literal, or the match offset if the item is a match.
          */
          */
-       u32 mc_item_data;
-#define MC_OFFSET_SHIFT 16
-#define MC_LEN_MASK (((u32)1 << MC_OFFSET_SHIFT) - 1)
+#define OPTIMUM_OFFSET_SHIFT   16
+#define OPTIMUM_LEN_MASK       (((u32)1 << OPTIMUM_OFFSET_SHIFT) - 1)
+       u32 item;
  };
  
  };
  
+#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
+
+/* An intermediate representation of an XPRESS match or literal  */
+struct xpress_item {
+       /*
+        * Bits 0  -  8: Symbol
+        * Bits 9  - 24: Length - XPRESS_MIN_MATCH_LEN
+        * Bits 25 - 28: Number of extra offset bits
+        * Bits 29+    : Extra offset bits
+        *
+        * Unfortunately, gcc generates worse code if we use real bitfields here.
+        */
+       u64 data;
+};
  
  /*
  
  /*
- * Structure to keep track of the current state of sending data to the
- * compressed output buffer.
+ * Structure to keep track of the current state of sending compressed data to
+ * the output buffer.
   *
   * The XPRESS bitstream is encoded as a sequence of little endian 16-bit coding
   * units interwoven with literal bytes.
   *
   * The XPRESS bitstream is encoded as a sequence of little endian 16-bit coding
   * units interwoven with literal bytes.
@@ -174,18 +231,38 @@ struct xpress_output_bitstream {
         u8 *end;
  };
  
         u8 *end;
  };
  
+/* Reset the symbol frequencies for the XPRESS Huffman code.  */
+static void
+xpress_reset_symbol_frequencies(struct xpress_compressor *c)
+{
+       memset(c->freqs, 0, sizeof(c->freqs));
+}
+
+/*
+ * Make the Huffman code for XPRESS.
+ *
+ * Input: c->freqs
+ * Output: c->lens and c->codewords
+ */
+static void
+xpress_make_huffman_code(struct xpress_compressor *c)
+{
+       make_canonical_huffman_code(XPRESS_NUM_SYMBOLS, XPRESS_MAX_CODEWORD_LEN,
+                                   c->freqs, c->lens, c->codewords);
+}
+
  /*
   * Initialize the output bitstream.
   *
   * @os
   *     The output bitstream structure to initialize.
   * @buffer
  /*
   * Initialize the output bitstream.
   *
   * @os
   *     The output bitstream structure to initialize.
   * @buffer
- *     The buffer to write to.
+ *     The output buffer.
   * @size
   *     Size of @buffer, in bytes.  Must be at least 4.
   */
  static void
   * @size
   *     Size of @buffer, in bytes.  Must be at least 4.
   */
  static void
-xpress_init_output(struct xpress_output_bitstream *os, void *buffer, u32 size)
+xpress_init_output(struct xpress_output_bitstream *os, void *buffer, size_t size)
  {
         os->bitbuf = 0;
         os->bitcount = 0;
  {
         os->bitbuf = 0;
         os->bitcount = 0;
@@ -207,7 +284,7 @@ xpress_init_output(struct xpress_output_bitstream *os, void *buffer, u32 size)
   */
  static inline void
  xpress_write_bits(struct xpress_output_bitstream *os,
   */
  static inline void
  xpress_write_bits(struct xpress_output_bitstream *os,
-                 const u32 bits, const unsigned int num_bits)
+                 const u32 bits, const unsigned num_bits)
  {
         /* This code is optimized for XPRESS, which never needs to write more
          * than 16 bits at once.  */
  {
         /* This code is optimized for XPRESS, which never needs to write more
          * than 16 bits at once.  */
@@ -236,14 +313,26 @@ xpress_write_byte(struct xpress_output_bitstream *os, u8 byte)
                 *os->next_byte++ = byte;
  }
  
                 *os->next_byte++ = byte;
  }
  
+/*
+ * Interweave two literal bytes into the output bitstream.
+ */
+static inline void
+xpress_write_u16(struct xpress_output_bitstream *os, u16 v)
+{
+       if (os->end - os->next_byte >= 2) {
+               put_unaligned_u16_le(v, os->next_byte);
+               os->next_byte += 2;
+       }
+}
+
  /*
   * Flush the last coding unit to the output buffer if needed.  Return the total
   * number of bytes written to the output buffer, or 0 if an overflow occurred.
   */
  /*
   * Flush the last coding unit to the output buffer if needed.  Return the total
   * number of bytes written to the output buffer, or 0 if an overflow occurred.
   */
-static u32
+static size_t
  xpress_flush_output(struct xpress_output_bitstream *os)
  {
  xpress_flush_output(struct xpress_output_bitstream *os)
  {
-       if (unlikely(os->end - os->next_byte < 2))
+       if (os->end - os->next_byte < 2)
                 return 0;
  
         put_unaligned_u16_le(os->bitbuf << (16 - os->bitcount), os->next_bits);
                 return 0;
  
         put_unaligned_u16_le(os->bitbuf << (16 - os->bitcount), os->next_bits);
@@ -252,986 +341,794 @@ xpress_flush_output(struct xpress_output_bitstream *os)
         return os->next_byte - os->start;
  }
  
         return os->next_byte - os->start;
  }
  
+static inline void
+xpress_write_extra_length_bytes(struct xpress_output_bitstream *os,
+                               unsigned adjusted_len)
+{
+       /* If length >= 18, output one extra length byte.
+        * If length >= 273, output three (total) extra length bytes.  */
+       if (adjusted_len >= 0xF) {
+               u8 byte1 = min(adjusted_len - 0xF, 0xFF);
+               xpress_write_byte(os, byte1);
+               if (byte1 == 0xFF)
+                       xpress_write_u16(os, adjusted_len);
+       }
+}
+
  /* Output a match or literal.  */
  static inline void
  xpress_write_item(struct xpress_item item, struct xpress_output_bitstream *os,
                   const u32 codewords[], const u8 lens[])
  {
         u64 data = item.data;
  /* Output a match or literal.  */
  static inline void
  xpress_write_item(struct xpress_item item, struct xpress_output_bitstream *os,
                   const u32 codewords[], const u8 lens[])
  {
         u64 data = item.data;
-       unsigned symbol;
-       unsigned adjusted_len;
-       unsigned num_extra_bits;
-       unsigned extra_bits;
-
-       symbol = data & 0x1FF;
+       unsigned symbol = data & 0x1FF;
  
         xpress_write_bits(os, codewords[symbol], lens[symbol]);
  
  
         xpress_write_bits(os, codewords[symbol], lens[symbol]);
  
-       if (symbol < XPRESS_NUM_CHARS)  /* Literal?  */
-               return;
-
-       adjusted_len = (data >> 9) & 0xFFFF;
-
-       /* If length >= 18, one extra length byte.
-        * If length >= 273, three (total) extra length bytes.  */
-       if (adjusted_len >= 0xf) {
-               u8 byte1 = min(adjusted_len - 0xf, 0xff);
-               xpress_write_byte(os, byte1);
-               if (byte1 == 0xff) {
-                       xpress_write_byte(os, adjusted_len & 0xff);
-                       xpress_write_byte(os, adjusted_len >> 8);
-               }
+       if (symbol >= XPRESS_NUM_CHARS) {
+               /* Match, not a literal  */
+               xpress_write_extra_length_bytes(os, (data >> 9) & 0xFFFF);
+               xpress_write_bits(os, data >> 29, (data >> 25) & 0xF);
         }
         }
-
-       num_extra_bits = (data >> 25) & 0xF;
-       extra_bits = data >> 29;
-
-       xpress_write_bits(os, extra_bits, num_extra_bits);
  }
  
  /* Output a sequence of XPRESS matches and literals.  */
  static void
  xpress_write_items(struct xpress_output_bitstream *os,
  }
  
  /* Output a sequence of XPRESS matches and literals.  */
  static void
  xpress_write_items(struct xpress_output_bitstream *os,
-                  const struct xpress_item items[], u32 num_items,
+                  const struct xpress_item items[], size_t num_items,
                    const u32 codewords[], const u8 lens[])
  {
                    const u32 codewords[], const u8 lens[])
  {
-       for (u32 i = 0; i < num_items; i++)
+       for (size_t i = 0; i < num_items; i++)
                 xpress_write_item(items[i], os, codewords, lens);
                 xpress_write_item(items[i], os, codewords, lens);
-
-       /* End-of-data symbol (required for MS compatibility)  */
-       xpress_write_bits(os, codewords[XPRESS_END_OF_DATA], lens[XPRESS_END_OF_DATA]);
  }
  
  }
  
-/* Make the Huffman code for XPRESS.
+#if SUPPORT_NEAR_OPTIMAL_PARSING
+
+/*
+ * Follow the minimum cost path in the graph of possible match/literal choices
+ * and write out the matches/literals using the specified Huffman code.
   *
   *
- * Takes as input c->freqs and produces as output c->lens and c->codewords.  */
+ * Note: this is slightly duplicated with xpress_write_items().  However, we
+ * don't want to waste time translating between intermediate match/literal
+ * representations.
+ */
  static void
  static void
-xpress_make_huffman_code(struct xpress_compressor *c)
-{
-       make_canonical_huffman_code(XPRESS_NUM_SYMBOLS, XPRESS_MAX_CODEWORD_LEN,
-                                   c->freqs, c->lens, c->codewords);
-}
-
-/* Tally, and optionally record, the specified literal byte.  */
-static inline void
-xpress_declare_literal(struct xpress_compressor *c, unsigned literal,
-                      struct xpress_item **next_chosen_item)
-{
-       c->freqs[literal]++;
-
-       if (next_chosen_item) {
-               *(*next_chosen_item)++ = (struct xpress_item) {
-                       .data = literal,
-               };
-       }
-}
-
-/* Tally, and optionally record, the specified match.  */
-static inline void
-xpress_declare_match(struct xpress_compressor *c,
-                    unsigned len, unsigned offset,
-                    struct xpress_item **next_chosen_item)
+xpress_write_item_list(struct xpress_output_bitstream *os,
+                      struct xpress_optimum_node *optimum_nodes,
+                      size_t count, const u32 codewords[], const u8 lens[])
  {
  {
-       unsigned adjusted_len = len - XPRESS_MIN_MATCH_LEN;
-       unsigned len_hdr = min(adjusted_len, 0xf);
-       unsigned offset_high_bit = fls32(offset);
-       unsigned sym = XPRESS_NUM_CHARS + ((offset_high_bit << 4) | len_hdr);
+       struct xpress_optimum_node *cur_optimum_ptr = optimum_nodes;
+       struct xpress_optimum_node *end_optimum_ptr = optimum_nodes + count;
+       do {
+               unsigned length = cur_optimum_ptr->item & OPTIMUM_LEN_MASK;
+               unsigned offset = cur_optimum_ptr->item >> OPTIMUM_OFFSET_SHIFT;
  
  
-       c->freqs[sym]++;
+               if (length == 1) {
+                       /* Literal  */
+                       unsigned literal = offset;
  
  
-       if (next_chosen_item) {
-               *(*next_chosen_item)++ = (struct xpress_item) {
-                       .data = (u64)sym |
-                               ((u64)adjusted_len << 9) |
-                               ((u64)offset_high_bit << 25) |
-                               ((u64)(offset ^ (1U << offset_high_bit)) << 29),
-               };
-       }
-}
+                       xpress_write_bits(os, codewords[literal], lens[literal]);
+               } else {
+                       /* Match  */
+                       unsigned adjusted_len;
+                       unsigned offset_high_bit;
+                       unsigned len_hdr;
+                       unsigned sym;
  
  
-/* Tally, and optionally record, the specified match or literal.  */
-static inline void
-xpress_declare_item(struct xpress_compressor *c, u32 mc_item_data,
-                   struct xpress_item **next_chosen_item)
-{
-       unsigned len = mc_item_data & MC_LEN_MASK;
-       unsigned offset_data = mc_item_data >> MC_OFFSET_SHIFT;
+                       adjusted_len = length - XPRESS_MIN_MATCH_LEN;
+                       offset_high_bit = fls32(offset);
+                       len_hdr = min(0xF, adjusted_len);
+                       sym = XPRESS_NUM_CHARS + ((offset_high_bit << 4) | len_hdr);
  
  
-       if (len == 1)
-               xpress_declare_literal(c, offset_data, next_chosen_item);
-       else
-               xpress_declare_match(c, len, offset_data, next_chosen_item);
+                       xpress_write_bits(os, codewords[sym], lens[sym]);
+                       xpress_write_extra_length_bytes(os, adjusted_len);
+                       xpress_write_bits(os, offset - (1U << offset_high_bit),
+                                         offset_high_bit);
+               }
+               cur_optimum_ptr += length;
+       } while (cur_optimum_ptr != end_optimum_ptr);
  }
  }
+#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
  
  
-static inline void
-xpress_record_item_list(struct xpress_compressor *c,
-                       struct xpress_mc_pos_data *cur_optimum_ptr,
-                       struct xpress_item **next_chosen_item)
+/*
+ * Output the XPRESS-compressed data, given the sequence of match/literal
+ * "items" that was chosen to represent the input data.
+ *
+ * If @near_optimal is %false, then the items are taken from the array
+ * c->chosen_items[0...count].
+ *
+ * If @near_optimal is %true, then the items are taken from the minimum cost
+ * path stored in c->optimum_nodes[0...count].
+ */
+static size_t
+xpress_write(struct xpress_compressor *c, void *out, size_t out_nbytes_avail,
+            size_t count, bool near_optimal)
  {
  {
-       struct xpress_mc_pos_data *end_optimum_ptr;
-       u32 saved_item;
-       u32 item;
+       u8 *cptr;
+       struct xpress_output_bitstream os;
+       size_t out_size;
  
  
-       /* The list is currently in reverse order (last item to first item).
-        * Reverse it.  */
-       end_optimum_ptr = cur_optimum_ptr;
-       saved_item = cur_optimum_ptr->mc_item_data;
-       do {
-               item = saved_item;
-               cur_optimum_ptr -= item & MC_LEN_MASK;
-               saved_item = cur_optimum_ptr->mc_item_data;
-               cur_optimum_ptr->mc_item_data = item;
-       } while (cur_optimum_ptr != c->optimum);
-
-       /* Walk the list of items from beginning to end, tallying and recording
-        * each item.  */
-       do {
-               xpress_declare_item(c, cur_optimum_ptr->mc_item_data, next_chosen_item);
-               cur_optimum_ptr += (cur_optimum_ptr->mc_item_data) & MC_LEN_MASK;
-       } while (cur_optimum_ptr != end_optimum_ptr);
-}
+       /* Account for the end-of-data symbol and make the Huffman code.  */
+       c->freqs[XPRESS_END_OF_DATA]++;
+       xpress_make_huffman_code(c);
  
  
-static inline void
-xpress_tally_item_list(struct xpress_compressor *c,
-                      struct xpress_mc_pos_data *cur_optimum_ptr)
-{
-       /* Since we're just tallying the items, we don't need to reverse the
-        * list.  Processing the items in reverse order is fine.  */
-       do {
-               xpress_declare_item(c, cur_optimum_ptr->mc_item_data, NULL);
-               cur_optimum_ptr -= (cur_optimum_ptr->mc_item_data & MC_LEN_MASK);
-       } while (cur_optimum_ptr != c->optimum);
-}
+       /* Output the Huffman code as a series of 512 4-bit lengths.  */
+       cptr = out;
+       for (unsigned i = 0; i < XPRESS_NUM_SYMBOLS; i += 2)
+               *cptr++ = (c->lens[i + 1] << 4) | c->lens[i];
  
  
-/* Tally, and optionally (if next_chosen_item != NULL) record, in order, all
- * items in the current list of items found by the match-chooser.  */
-static void
-xpress_declare_item_list(struct xpress_compressor *c,
-                        struct xpress_mc_pos_data *cur_optimum_ptr,
-                        struct xpress_item **next_chosen_item)
-{
-       if (next_chosen_item)
-               xpress_record_item_list(c, cur_optimum_ptr, next_chosen_item);
-       else
-               xpress_tally_item_list(c, cur_optimum_ptr);
-}
+       xpress_init_output(&os, cptr, out_nbytes_avail - XPRESS_NUM_SYMBOLS / 2);
  
  
-static unsigned
-xpress_get_matches_fillcache(struct xpress_compressor *c,
-                            const struct lz_match **matches_ret)
-{
-       struct lz_match *cache_ptr;
-       struct lz_match *matches;
-       unsigned num_matches;
-
-       cache_ptr = c->cache_ptr;
-       matches = cache_ptr + 1;
-       if (likely(cache_ptr <= c->cache_limit)) {
-               num_matches = lz_mf_get_matches(c->mf, matches);
-               cache_ptr->len = num_matches;
-               c->cache_ptr = matches + num_matches;
-       } else {
-               num_matches = 0;
-       }
-       *matches_ret = matches;
-       return num_matches;
-}
+       /* Output the Huffman-encoded items.  */
+#if SUPPORT_NEAR_OPTIMAL_PARSING
+       if (near_optimal) {
+               xpress_write_item_list(&os, c->optimum_nodes, count,
+                                      c->codewords, c->lens);
  
  
-static unsigned
-xpress_get_matches_usecache(struct xpress_compressor *c,
-                           const struct lz_match **matches_ret)
-{
-       struct lz_match *cache_ptr;
-       struct lz_match *matches;
-       unsigned num_matches;
-
-       cache_ptr = c->cache_ptr;
-       matches = cache_ptr + 1;
-       if (cache_ptr <= c->cache_limit) {
-               num_matches = cache_ptr->len;
-               c->cache_ptr = matches + num_matches;
-       } else {
-               num_matches = 0;
+       } else
+#endif
+       {
+               xpress_write_items(&os, c->chosen_items, count,
+                                  c->codewords, c->lens);
         }
         }
-       *matches_ret = matches;
-       return num_matches;
-}
-
-static unsigned
-xpress_get_matches_usecache_nocheck(struct xpress_compressor *c,
-                                   const struct lz_match **matches_ret)
-{
-       struct lz_match *cache_ptr;
-       struct lz_match *matches;
-       unsigned num_matches;
-
-       cache_ptr = c->cache_ptr;
-       matches = cache_ptr + 1;
-       num_matches = cache_ptr->len;
-       c->cache_ptr = matches + num_matches;
-       *matches_ret = matches;
-       return num_matches;
-}
  
  
-static unsigned
-xpress_get_matches_noncaching(struct xpress_compressor *c,
-                             const struct lz_match **matches_ret)
-{
-       *matches_ret = c->cached_matches;
-       return lz_mf_get_matches(c->mf, c->cached_matches);
-}
+       /* Write the end-of-data symbol (needed for MS compatibility)  */
+       xpress_write_bits(&os, c->codewords[XPRESS_END_OF_DATA],
+                         c->lens[XPRESS_END_OF_DATA]);
  
  
-/*
- * Find matches at the next position in the window.
- *
- * This uses a wrapper function around the underlying match-finder.
- *
- * Returns the number of matches found and sets *matches_ret to point to the
- * matches array.  The matches will be sorted by strictly increasing length and
- * offset.
- */
-static inline unsigned
-xpress_get_matches(struct xpress_compressor *c,
-                  const struct lz_match **matches_ret)
-{
-       return (*c->get_matches_func)(c, matches_ret);
-}
+       /* Flush any pending data.  Then return the compressed size if the
+        * compressed data fit in the output buffer, or 0 if it did not.  */
+       out_size = xpress_flush_output(&os);
+       if (out_size == 0)
+               return 0;
  
  
-static void
-xpress_skip_bytes_fillcache(struct xpress_compressor *c, unsigned n)
-{
-       struct lz_match *cache_ptr;
-
-       cache_ptr = c->cache_ptr;
-       lz_mf_skip_positions(c->mf, n);
-       if (cache_ptr <= c->cache_limit) {
-               do {
-                       cache_ptr->len = 0;
-                       cache_ptr += 1;
-               } while (--n && likely(cache_ptr <= c->cache_limit));
-       }
-       c->cache_ptr = cache_ptr;
+       return out_size + XPRESS_NUM_SYMBOLS / 2;
  }
  
  }
  
-static void
-xpress_skip_bytes_usecache(struct xpress_compressor *c, unsigned n)
+/* Tally the Huffman symbol for a literal and return the intermediate
+ * representation of that literal.  */
+static inline struct xpress_item
+xpress_record_literal(struct xpress_compressor *c, unsigned literal)
  {
  {
-       struct lz_match *cache_ptr;
+       c->freqs[literal]++;
  
  
-       cache_ptr = c->cache_ptr;
-       if (likely(cache_ptr <= c->cache_limit)) {
-               do {
-                       cache_ptr += 1 + cache_ptr->len;
-               } while (--n && likely(cache_ptr <= c->cache_limit));
-       }
-       c->cache_ptr = cache_ptr;
+       return (struct xpress_item) {
+               .data = literal,
+       };
  }
  
  }
  
-static void
-xpress_skip_bytes_usecache_nocheck(struct xpress_compressor *c, unsigned n)
+/* Tally the Huffman symbol for a match and return the intermediate
+ * representation of that match.  */
+static inline struct xpress_item
+xpress_record_match(struct xpress_compressor *c, unsigned length, unsigned offset)
  {
  {
-       struct lz_match *cache_ptr;
+       unsigned adjusted_len = length - XPRESS_MIN_MATCH_LEN;
+       unsigned len_hdr = min(adjusted_len, 0xF);
+       unsigned offset_high_bit = fls32(offset);
+       unsigned sym = XPRESS_NUM_CHARS + ((offset_high_bit << 4) | len_hdr);
  
  
-       cache_ptr = c->cache_ptr;
-       do {
-               cache_ptr += 1 + cache_ptr->len;
-       } while (--n);
-       c->cache_ptr = cache_ptr;
-}
+       c->freqs[sym]++;
  
  
-static void
-xpress_skip_bytes_noncaching(struct xpress_compressor *c, unsigned n)
-{
-       lz_mf_skip_positions(c->mf, n);
+       return (struct xpress_item) {
+               .data = (u64)sym |
+                       ((u64)adjusted_len << 9) |
+                       ((u64)offset_high_bit << 25) |
+                       ((u64)(offset ^ (1U << offset_high_bit)) << 29),
+       };
  }
  
  /*
  }
  
  /*
- * Skip the specified number of positions in the window (don't search for
- * matches at them).
- *
- * This uses a wrapper function around the underlying match-finder.
+ * This is the "greedy" XPRESS compressor. It always chooses the longest match.
+ * (Exception: as a heuristic, we pass up length 3 matches that have large
+ * offsets.)
   */
   */
-static inline void
-xpress_skip_bytes(struct xpress_compressor *c, unsigned n)
-{
-       return (*c->skip_bytes_func)(c, n);
-}
-
-/* Set default XPRESS Huffman symbol costs to bootstrap the iterative
- * optimization algorithm.  */
-static void
-xpress_set_default_costs(u8 costs[])
+static size_t
+xpress_compress_greedy(struct xpress_compressor * restrict c,
+                      const void * restrict in, size_t in_nbytes,
+                      void * restrict out, size_t out_nbytes_avail)
  {
  {
-       unsigned i;
-
-       /* Literal symbols  */
-       for (i = 0; i < XPRESS_NUM_CHARS; i++)
-               costs[i] = 8;
+       const u8 * const in_base = in;
+       const u8 *       in_next = in_base;
+       const u8 * const in_end = in_base + in_nbytes;
+       struct xpress_item *next_chosen_item = c->chosen_items;
+       unsigned len_3_too_far;
  
  
-       /* Match symbols  */
-       for (; i < XPRESS_NUM_SYMBOLS; i++)
-               costs[i] = 10;
-}
+       if (in_nbytes <= 8192)
+               len_3_too_far = 2048;
+       else
+               len_3_too_far = 4096;
  
  
-/* Copy the Huffman codeword lengths array @lens to the Huffman symbol costs
- * array @costs, but also assign a default cost to each 0-length (unused)
- * codeword.  */
-static void
-xpress_set_costs(u8 costs[], const u8 lens[])
-{
-       for (unsigned i = 0; i < XPRESS_NUM_SYMBOLS; i++)
-               costs[i] = lens[i] ? lens[i] : XPRESS_MAX_CODEWORD_LEN;
-}
+       hc_matchfinder_init(&c->hc_mf);
  
  
-/*
- * Consider coding each match in @matches.
- *
- * @matches must be sorted by strictly increasing length and strictly
- * increasing offset.  This is guaranteed by the match-finder.
- *
- * We consider each length from the minimum (3) to the longest
- * (matches[num_matches - 1].len).  For each length, we consider only
- * the smallest offset for which that length is available.  Although
- * this is not guaranteed to be optimal due to the possibility of a
- * larger offset costing less than a smaller offset to code, this is a
- * very useful heuristic.
- */
-static inline void
-xpress_consider_matches(struct xpress_compressor *c,
-                       struct xpress_mc_pos_data *cur_optimum_ptr,
-                       const struct lz_match matches[],
-                       unsigned num_matches)
-{
-       unsigned i = 0;
-       unsigned len = XPRESS_MIN_MATCH_LEN;
-       u32 cost;
-       u32 position_cost;
-       unsigned offset;
-       unsigned offset_high_bit;
-       unsigned adjusted_len;
-       unsigned len_hdr;
-       unsigned sym;
-
-       if (matches[num_matches - 1].len < 0xf + XPRESS_MIN_MATCH_LEN) {
-               /* All lengths are small.  Optimize accordingly.  */
-               do {
-                       offset = matches[i].offset;
-                       offset_high_bit = fls32(offset);
-                       len_hdr = len - XPRESS_MIN_MATCH_LEN;
-                       sym = XPRESS_NUM_CHARS + ((offset_high_bit << 4) | len_hdr);
+       do {
+               unsigned length;
+               unsigned offset;
+
+               length = hc_matchfinder_longest_match(&c->hc_mf,
+                                                     in_base,
+                                                     in_next,
+                                                     XPRESS_MIN_MATCH_LEN - 1,
+                                                     in_end - in_next,
+                                                     min(in_end - in_next, c->nice_match_length),
+                                                     c->max_search_depth,
+                                                     &offset);
+               if (length >= XPRESS_MIN_MATCH_LEN &&
+                   !(length == XPRESS_MIN_MATCH_LEN && offset >= len_3_too_far))
+               {
+                       /* Match found  */
+                       *next_chosen_item++ =
+                               xpress_record_match(c, length, offset);
+                       in_next += 1;
+                       hc_matchfinder_skip_positions(&c->hc_mf,
+                                                     in_base,
+                                                     in_next,
+                                                     in_end,
+                                                     length - 1);
+                       in_next += length - 1;
+               } else {
+                       /* No match found  */
+                       *next_chosen_item++ =
+                               xpress_record_literal(c, *in_next);
+                       in_next += 1;
+               }
+       } while (in_next != in_end);
  
  
-                       position_cost = cur_optimum_ptr->cost + offset_high_bit;
-                       do {
-                               cost = position_cost + c->costs[sym];
-                               if (cost < (cur_optimum_ptr + len)->cost) {
-                                       (cur_optimum_ptr + len)->cost = cost;
-                                       (cur_optimum_ptr + len)->mc_item_data =
-                                               (offset << MC_OFFSET_SHIFT) | len;
-                               }
-                               sym++;
-                       } while (++len <= matches[i].len);
-               } while (++i != num_matches);
-       } else {
-               /* Some lengths are big.  */
-               do {
-                       offset = matches[i].offset;
-                       offset_high_bit = fls32(offset);
-                       position_cost = cur_optimum_ptr->cost + offset_high_bit;
-                       do {
-                               adjusted_len = len - XPRESS_MIN_MATCH_LEN;
-                               len_hdr = min(adjusted_len, 0xf);
-                               sym = XPRESS_NUM_CHARS + ((offset_high_bit << 4) | len_hdr);
-
-                               cost = position_cost + c->costs[sym];
-                               if (adjusted_len >= 0xf) {
-                                       cost += 8;
-                                       if (adjusted_len - 0xf >= 0xff)
-                                               cost += 16;
-                               }
-
-                               if (cost < (cur_optimum_ptr + len)->cost) {
-                                       (cur_optimum_ptr + len)->cost = cost;
-                                       (cur_optimum_ptr + len)->mc_item_data =
-                                               (offset << MC_OFFSET_SHIFT) | len;
-                               }
-                       } while (++len <= matches[i].len);
-               } while (++i != num_matches);
-       }
+       return xpress_write(c, out, out_nbytes_avail,
+                           next_chosen_item - c->chosen_items, false);
  }
  
  /*
  }
  
  /*
- * The main near-optimal parsing routine.
- *
- * Briefly, the algorithm does an approximate minimum-cost path search to find a
- * "near-optimal" sequence of matches and literals to output, based on the
- * current cost model.  The algorithm steps forward, position by position (byte
- * by byte), and updates the minimum cost path to reach each later position that
- * can be reached using a match or literal from the current position.  This is
- * essentially Dijkstra's algorithm in disguise: the graph nodes are positions,
- * the graph edges are possible matches/literals to code, and the cost of each
- * edge is the estimated number of bits that will be required to output the
- * corresponding match or literal.  But one difference is that we actually
- * compute the lowest-cost path in pieces, where each piece is terminated when
- * there are no choices to be made.
- *
- * If next_chosen_item != NULL, then all items chosen will be recorded (saved in
- * the chosen_items array).  Otherwise, all items chosen will only be tallied
- * (symbol frequencies tallied in c->freqs).
+ * This is the "lazy" XPRESS compressor.  Before choosing a match, it checks to
+ * see if there's a longer match at the next position.  If yes, it outputs a
+ * literal and continues to the next position.  If no, it outputs the match.
   */
   */
-static void
-xpress_optim_pass(struct xpress_compressor *c,
-                 struct xpress_item **next_chosen_item)
+static size_t
+xpress_compress_lazy(struct xpress_compressor * restrict c,
+                    const void * restrict in, size_t in_nbytes,
+                    void * restrict out, size_t out_nbytes_avail)
  {
  {
-       const u8 *window_end;
-       const u8 *window_ptr;
-       struct xpress_mc_pos_data *cur_optimum_ptr;
-       struct xpress_mc_pos_data *end_optimum_ptr;
-       const struct lz_match *matches;
-       unsigned num_matches;
-       unsigned longest_len;
-       unsigned literal;
-       u32 cost;
-
-       window_ptr = c->cur_window;
-       window_end = &c->cur_window[c->cur_window_size];
-
-begin:
-       /* Start building a new list of items, which will correspond to the next
-        * piece of the overall minimum-cost path.  */
-
-       if (window_ptr == window_end)
-               return;
-
-       cur_optimum_ptr = c->optimum;
-       cur_optimum_ptr->cost = 0;
-       end_optimum_ptr = cur_optimum_ptr;
-
-       /* The following loop runs once for each per byte in the window, except
-        * in a couple shortcut cases.  */
-       for (;;) {
-
-               /* Find matches with the current position.  */
-               num_matches = xpress_get_matches(c, &matches);
-
-               if (num_matches) {
-
-                       longest_len = matches[num_matches - 1].len;
+       const u8 * const in_base = in;
+       const u8 *       in_next = in_base;
+       const u8 * const in_end = in_base + in_nbytes;
+       struct xpress_item *next_chosen_item = c->chosen_items;
+       unsigned len_3_too_far;
  
  
-                       /* If there's a very long match, choose it immediately.
-                        */
-                       if (longest_len >= c->params.nice_match_length) {
+       if (in_nbytes <= 8192)
+               len_3_too_far = 2048;
+       else
+               len_3_too_far = 4096;
  
  
-                               xpress_skip_bytes(c, longest_len - 1);
-                               window_ptr += longest_len;
+       hc_matchfinder_init(&c->hc_mf);
  
  
-                               if (cur_optimum_ptr != c->optimum)
-                                       xpress_declare_item_list(c, cur_optimum_ptr,
-                                                                next_chosen_item);
+       do {
+               unsigned cur_len;
+               unsigned cur_offset;
+               unsigned next_len;
+               unsigned next_offset;
+
+               /* Find the longest match at the current position.  */
+               cur_len = hc_matchfinder_longest_match(&c->hc_mf,
+                                                      in_base,
+                                                      in_next,
+                                                      XPRESS_MIN_MATCH_LEN - 1,
+                                                      in_end - in_next,
+                                                      min(in_end - in_next, c->nice_match_length),
+                                                      c->max_search_depth,
+                                                      &cur_offset);
+               in_next += 1;
+
+               if (cur_len < XPRESS_MIN_MATCH_LEN ||
+                   (cur_len == XPRESS_MIN_MATCH_LEN &&
+                    cur_offset >= len_3_too_far))
+               {
+                       /* No match found.  Choose a literal.  */
+                       *next_chosen_item++ =
+                               xpress_record_literal(c, *(in_next - 1));
+                       continue;
+               }
  
  
-                               xpress_declare_match(c, longest_len,
-                                                    matches[num_matches - 1].offset,
-                                                    next_chosen_item);
-                               goto begin;
-                       }
+       have_cur_match:
+               /* We have a match at the current position.  */
  
  
-                       /* If reaching any positions for the first time,
-                        * initialize their costs to "infinity".  */
-                       while (end_optimum_ptr < cur_optimum_ptr + longest_len)
-                               (++end_optimum_ptr)->cost = MC_INFINITE_COST;
+               /* If the current match is very long, choose it immediately.  */
+               if (cur_len >= c->nice_match_length) {
  
  
-                       /* Consider coding a match.  */
-                       xpress_consider_matches(c, cur_optimum_ptr,
-                                               matches, num_matches);
-               } else {
-                       /* No matches found.  The only choice at this position
-                        * is to code a literal.  */
-
-                       if (end_optimum_ptr == cur_optimum_ptr) {
-                       #if 1
-                               /* Optimization for single literals.  */
-                               if (likely(cur_optimum_ptr == c->optimum)) {
-                                       xpress_declare_literal(c, *window_ptr++,
-                                                              next_chosen_item);
-                                       if (window_ptr == window_end)
-                                               return;
-                                       continue;
-                               }
-                       #endif
-                               (++end_optimum_ptr)->cost = MC_INFINITE_COST;
-                       }
-               }
+                       *next_chosen_item++ =
+                               xpress_record_match(c, cur_len, cur_offset);
  
  
-               /* Consider coding a literal.  */
-               literal = *window_ptr++;
-               cost = cur_optimum_ptr->cost + c->costs[literal];
-               if (cost < (cur_optimum_ptr + 1)->cost) {
-                       (cur_optimum_ptr + 1)->cost = cost;
-                       (cur_optimum_ptr + 1)->mc_item_data =
-                               ((u32)literal << MC_OFFSET_SHIFT) | 1;
+                       hc_matchfinder_skip_positions(&c->hc_mf,
+                                                     in_base,
+                                                     in_next,
+                                                     in_end,
+                                                     cur_len - 1);
+                       in_next += cur_len - 1;
+                       continue;
                 }
  
                 }
  
-               /* Advance to the next position.  */
-               cur_optimum_ptr++;
-
                 /*
                 /*
-                * This loop will terminate when either of the following
-                * conditions is true:
-                *
-                * (1) cur_optimum_ptr == end_optimum_ptr
+                * Try to find a match at the next position.
                  *
                  *
-                *      There are no paths that extend beyond the current
-                *      position.  In this case, any path to a later position
-                *      must pass through the current position, so we can go
-                *      ahead and choose the list of items that led to this
-                *      position.
+                * Note: since we already have a match at the *current*
+                * position, we use only half the 'max_search_depth' when
+                * checking the *next* position.  This is a useful trade-off
+                * because it's more worthwhile to use a greater search depth on
+                * the initial match than on the next match (since a lot of the
+                * time, that next match won't even be used).
                  *
                  *
-                * (2) cur_optimum_ptr == &c->optimum[XPRESS_OPTIM_ARRAY_LENGTH]
-                *
-                *      This bounds the number of times the algorithm can step
-                *      forward before it is guaranteed to start choosing items.
-                *      This limits the memory usage.  But
-                *      XPRESS_OPTIM_ARRAY_LENGTH is high enough that on most
-                *      inputs this limit is never reached.
-                *
-                * Note: no check for end-of-window is needed because
-                * end-of-window will trigger condition (1).
+                * Note: it's possible to structure the code such that there's
+                * only one call to longest_match(), which handles both the
+                * "find the initial match" and "try to find a longer match"
+                * cases.  However, it is faster to have two call sites, with
+                * longest_match() inlined at each.
                  */
                  */
-               if (cur_optimum_ptr == end_optimum_ptr ||
-                   cur_optimum_ptr == &c->optimum[XPRESS_OPTIM_ARRAY_LENGTH])
-                       break;
-       }
+               next_len = hc_matchfinder_longest_match(&c->hc_mf,
+                                                       in_base,
+                                                       in_next,
+                                                       cur_len,
+                                                       in_end - in_next,
+                                                       min(in_end - in_next, c->nice_match_length),
+                                                       c->max_search_depth / 2,
+                                                       &next_offset);
+               in_next += 1;
+
+               if (next_len > cur_len) {
+                       /* Found a longer match at the next position, so output
+                        * a literal.  */
+                       *next_chosen_item++ =
+                               xpress_record_literal(c, *(in_next - 2));
+                       cur_len = next_len;
+                       cur_offset = next_offset;
+                       goto have_cur_match;
+               } else {
+                       /* Didn't find a longer match at the next position, so
+                        * output the current match.  */
+                       *next_chosen_item++ =
+                               xpress_record_match(c, cur_len, cur_offset);
+                       hc_matchfinder_skip_positions(&c->hc_mf,
+                                                     in_base,
+                                                     in_next,
+                                                     in_end,
+                                                     cur_len - 2);
+                       in_next += cur_len - 2;
+                       continue;
+               }
+       } while (in_next != in_end);
  
  
-       /* Choose the current list of items that constitute the minimum-cost
-        * path to the current position.  */
-       xpress_declare_item_list(c, cur_optimum_ptr, next_chosen_item);
-       goto begin;
+       return xpress_write(c, out, out_nbytes_avail,
+                           next_chosen_item - c->chosen_items, false);
  }
  
  }
  
-/* Near-optimal parsing  */
-static u32
-xpress_choose_near_optimal_items(struct xpress_compressor *c)
-{
-       u32 num_passes_remaining = c->params.num_optim_passes;
-       struct xpress_item *next_chosen_item;
-       struct xpress_item **next_chosen_item_ptr;
-
-       /* Choose appropriate match-finder wrapper functions.  */
-       if (c->params.num_optim_passes > 1) {
-               c->get_matches_func = xpress_get_matches_fillcache;
-               c->skip_bytes_func = xpress_skip_bytes_fillcache;
-       } else {
-               c->get_matches_func = xpress_get_matches_noncaching;
-               c->skip_bytes_func = xpress_skip_bytes_noncaching;
-       }
-
-       /* The first optimization pass will use a default cost model.  Each
-        * additional optimization pass will use a cost model computed from the
-        * previous pass.
-        *
-        * To improve performance, we only generate the array containing the
-        * matches and literals in intermediate form on the final pass.  For
-        * earlier passes, tallying symbol frequencies is sufficient.  */
-       xpress_set_default_costs(c->costs);
+#if SUPPORT_NEAR_OPTIMAL_PARSING
  
  
-       next_chosen_item_ptr = NULL;
-       do {
-               /* Reset the match-finder wrapper.  */
-               c->cache_ptr = c->cached_matches;
+/*
+ * Set Huffman symbol costs for the first optimization pass.
+ *
+ * It works well to assume that each Huffman symbol is equally probable.  This
+ * results in each symbol being assigned a cost of -log2(1.0/num_syms) where
+ * 'num_syms' is the number of symbols in the alphabet.
+ */
+static void
+xpress_set_default_costs(struct xpress_compressor *c)
+{
+       for (unsigned i = 0; i < XPRESS_NUM_SYMBOLS; i++)
+               c->costs[i] = 9;
+}
  
  
-               if (num_passes_remaining == 1) {
-                       /* Last pass: actually generate the items.  */
-                       next_chosen_item = c->chosen_items;
-                       next_chosen_item_ptr = &next_chosen_item;
-               }
+/* Update the cost model based on the codeword lengths @c->lens.  */
+static void
+xpress_update_costs(struct xpress_compressor *c)
+{
+       for (unsigned i = 0; i < XPRESS_NUM_SYMBOLS; i++)
+               c->costs[i] = c->lens[i] ? c->lens[i] : XPRESS_MAX_CODEWORD_LEN;
+}
  
  
-               /* Choose the items.  */
-               xpress_optim_pass(c, next_chosen_item_ptr);
+/*
+ * Follow the minimum cost path in the graph of possible match/literal choices
+ * and compute the frequencies of the Huffman symbols that are needed to output
+ * those matches and literals.
+ */
+static void
+xpress_tally_item_list(struct xpress_compressor *c,
+                      struct xpress_optimum_node *end_optimum_ptr)
+{
+       struct xpress_optimum_node *cur_optimum_ptr = c->optimum_nodes;
  
  
-               if (num_passes_remaining > 1) {
-                       /* This isn't the last pass.  */
+       do {
+               unsigned length = cur_optimum_ptr->item & OPTIMUM_LEN_MASK;
+               unsigned offset = cur_optimum_ptr->item >> OPTIMUM_OFFSET_SHIFT;
  
  
-                       /* Make the Huffman code from the symbol frequencies.  */
-                       c->freqs[XPRESS_END_OF_DATA]++;
-                       xpress_make_huffman_code(c);
+               if (length == 1) {
+                       /* Literal  */
+                       unsigned literal = offset;
  
  
-                       /* Reset symbol frequencies.  */
-                       memset(c->freqs, 0, sizeof(c->freqs));
+                       c->freqs[literal]++;
+               } else {
+                       /* Match  */
+                       unsigned adjusted_len;
+                       unsigned offset_high_bit;
+                       unsigned len_hdr;
+                       unsigned sym;
  
  
-                       /* Update symbol costs.  */
-                       xpress_set_costs(c->costs, c->lens);
+                       adjusted_len = length - XPRESS_MIN_MATCH_LEN;
+                       offset_high_bit = fls32(offset);
+                       len_hdr = min(0xF, adjusted_len);
+                       sym = XPRESS_NUM_CHARS + ((offset_high_bit << 4) | len_hdr);
  
  
-                       /* Choose appopriate match-finder wrapper functions.  */
-                       if (c->cache_ptr <= c->cache_limit) {
-                               c->get_matches_func = xpress_get_matches_usecache_nocheck;
-                               c->skip_bytes_func = xpress_skip_bytes_usecache_nocheck;
-                       } else {
-                               c->get_matches_func = xpress_get_matches_usecache;
-                               c->skip_bytes_func = xpress_skip_bytes_usecache;
-                       }
+                       c->freqs[sym]++;
                 }
                 }
-       } while (--num_passes_remaining);
-
-       /* Return the number of items chosen.  */
-       return next_chosen_item - c->chosen_items;
+               cur_optimum_ptr += length;
+       } while (cur_optimum_ptr != end_optimum_ptr);
  }
  
  }
  
-/* Lazy parsing  */
-static u32
-xpress_choose_lazy_items(struct xpress_compressor *c)
+/*
+ * Find a new minimum cost path through the graph of possible match/literal
+ * choices.  We find the minimum cost path from 'c->optimum_nodes[0]', which
+ * represents the node at the beginning of the input buffer, to
+ * 'c->optimum_nodes[in_nbytes]', which represents the node at the end of the
+ * input buffer.  Edge costs are evaluated using the cost model 'c->costs'.
+ *
+ * The algorithm works backward, starting at 'c->optimum_nodes[in_nbytes]' and
+ * proceeding backwards one position at a time.  At each position, the minimum
+ * cost to reach 'c->optimum_nodes[in_nbytes]' from that position is computed
+ * and the match/literal choice is saved.
+ */
+static void
+xpress_find_min_cost_path(struct xpress_compressor *c, size_t in_nbytes,
+                         struct lz_match *end_cache_ptr)
  {
  {
-       const u8 *window_ptr = c->cur_window;
-       const u8 *window_end = &c->cur_window[c->cur_window_size];
-       struct xpress_item *next_chosen_item = c->chosen_items;
-       u32 len_3_too_far;
-       struct lz_mf *mf = c->mf;
-       struct lz_match *matches = c->cached_matches;
-       unsigned num_matches;
-       struct lz_match prev_match;
-
-       if (c->cur_window_size <= 8192)
-               len_3_too_far = 2048;
-       else
-               len_3_too_far = 4096;
+       struct xpress_optimum_node *cur_optimum_ptr = c->optimum_nodes + in_nbytes;
+       struct lz_match *cache_ptr = end_cache_ptr;
  
  
+       cur_optimum_ptr->cost_to_end = 0;
         do {
         do {
-               /* Don't have match at previous position  */
+               unsigned literal;
+               u32 best_item;
+               u32 best_cost_to_end;
+               unsigned num_matches;
+               struct lz_match *match;
+               unsigned len;
  
  
-               num_matches = lz_mf_get_matches(mf, matches);
-               window_ptr++;
+               cur_optimum_ptr--;
+               cache_ptr--;
  
  
-               if (num_matches == 0 ||
-                   (matches[num_matches - 1].len == 3 &&
-                    matches[num_matches - 1].offset >= len_3_too_far))
-               {
-                       /* No matches found => output literal  */
-                       xpress_declare_literal(c, *(window_ptr - 1),
-                                              &next_chosen_item);
-                       continue;
-               }
+               literal = cache_ptr->offset;
  
  
-               prev_match = matches[num_matches - 1];
+               /* Consider coding a literal.  */
+               best_item = ((u32)literal << OPTIMUM_OFFSET_SHIFT) | 1;
+               best_cost_to_end = c->costs[literal] +
+                                  (cur_optimum_ptr + 1)->cost_to_end;
  
  
-       have_prev_match:
-               /* Have match at previous position  */
+               num_matches = cache_ptr->length;
  
  
-               if (prev_match.len >= c->params.nice_match_length) {
-                       /* Very long match found => output immediately  */
-                       xpress_declare_match(c, prev_match.len,
-                                            prev_match.offset,
-                                            &next_chosen_item);
-                       lz_mf_skip_positions(mf, prev_match.len - 1);
-                       window_ptr += prev_match.len - 1;
+               if (num_matches == 0) {
+                       /* No matches; the only choice is the literal.  */
+                       cur_optimum_ptr->cost_to_end = best_cost_to_end;
+                       cur_optimum_ptr->item = best_item;
                         continue;
                 }
  
                         continue;
                 }
  
-               num_matches = lz_mf_get_matches(mf, matches);
-               window_ptr++;
-
-               if (num_matches == 0 ||
-                   (matches[num_matches - 1].len <= prev_match.len))
-               {
-                       /* Next match is not longer => output previous match  */
-                       xpress_declare_match(c, prev_match.len,
-                                            prev_match.offset,
-                                            &next_chosen_item);
-                       lz_mf_skip_positions(mf, prev_match.len - 2);
-                       window_ptr += prev_match.len - 2;
-                       continue;
+               /*
+                * Consider each match length from the minimum
+                * (XPRESS_MIN_MATCH_LEN) to the length of the longest match
+                * found at this position.  For each length, consider only the
+                * smallest offset for which that length is available.  Although
+                * this is not guaranteed to be optimal due to the possibility
+                * of a larger offset costing less than a smaller offset to
+                * code, this is a very useful heuristic.
+                */
+               match = cache_ptr - num_matches;
+               len = XPRESS_MIN_MATCH_LEN;
+               if (cache_ptr[-1].length < 0xF + XPRESS_MIN_MATCH_LEN) {
+                       /* All lengths are small.  Optimize accordingly.  */
+                       do {
+                               unsigned offset;
+                               unsigned offset_high_bit;
+                               u32 offset_cost;
+
+                               offset = match->offset;
+                               offset_high_bit = fls32(offset);
+                               offset_cost = offset_high_bit;
+                               do {
+                                       unsigned len_hdr;
+                                       unsigned sym;
+                                       u32 cost_to_end;
+
+                                       len_hdr = len - XPRESS_MIN_MATCH_LEN;
+                                       sym = XPRESS_NUM_CHARS +
+                                             ((offset_high_bit << 4) | len_hdr);
+                                       cost_to_end =
+                                               offset_cost + c->costs[sym] +
+                                               (cur_optimum_ptr + len)->cost_to_end;
+                                       if (cost_to_end < best_cost_to_end) {
+                                               best_cost_to_end = cost_to_end;
+                                               best_item =
+                                                       ((u32)offset <<
+                                                        OPTIMUM_OFFSET_SHIFT) | len;
+                                       }
+                               } while (++len <= match->length);
+                       } while (++match != cache_ptr);
+               } else {
+                       /* Some lengths are big.  */
+                       do {
+                               unsigned offset;
+                               unsigned offset_high_bit;
+                               u32 offset_cost;
+
+                               offset = match->offset;
+                               offset_high_bit = fls32(offset);
+                               offset_cost = offset_high_bit;
+                               do {
+                                       unsigned adjusted_len;
+                                       unsigned len_hdr;
+                                       unsigned sym;
+                                       u32 cost_to_end;
+
+                                       adjusted_len = len - XPRESS_MIN_MATCH_LEN;
+                                       len_hdr = min(adjusted_len, 0xF);
+                                       sym = XPRESS_NUM_CHARS +
+                                             ((offset_high_bit << 4) | len_hdr);
+                                       cost_to_end =
+                                               offset_cost + c->costs[sym] +
+                                               (cur_optimum_ptr + len)->cost_to_end;
+                                       if (adjusted_len >= 0xF) {
+                                               cost_to_end += 8;
+                                               if (adjusted_len - 0xF >= 0xFF)
+                                                       cost_to_end += 16;
+                                       }
+                                       if (cost_to_end < best_cost_to_end) {
+                                               best_cost_to_end = cost_to_end;
+                                               best_item =
+                                                       ((u32)offset <<
+                                                        OPTIMUM_OFFSET_SHIFT) | len;
+                                       }
+                               } while (++len <= match->length);
+                       } while (++match != cache_ptr);
                 }
                 }
-
-               /* Next match is longer => output literal  */
-
-               xpress_declare_literal(c, *(window_ptr - 2), &next_chosen_item);
-
-               prev_match = matches[num_matches - 1];
-
-               goto have_prev_match;
-
-       } while (window_ptr != window_end);
-
-       return next_chosen_item - c->chosen_items;
+               cache_ptr -= num_matches;
+               cur_optimum_ptr->cost_to_end = best_cost_to_end;
+               cur_optimum_ptr->item = best_item;
+       } while (cur_optimum_ptr != c->optimum_nodes);
  }
  
  }
  
-/* Greedy parsing  */
-static u32
-xpress_choose_greedy_items(struct xpress_compressor *c)
+/*
+ * This routine finds matches at each position in the buffer in[0...in_nbytes].
+ * The matches are cached in the array c->match_cache, and the return value is a
+ * pointer past the last slot in this array that was filled.
+ */
+static struct lz_match *
+xpress_find_matches(struct xpress_compressor * restrict c,
+                   const void * restrict in, size_t in_nbytes)
  {
  {
-       const u8 *window_ptr = c->cur_window;
-       const u8 *window_end = &c->cur_window[c->cur_window_size];
-       struct xpress_item *next_chosen_item = c->chosen_items;
-       u32 len_3_too_far;
-       struct lz_mf *mf = c->mf;
-       struct lz_match *matches = c->cached_matches;
-       unsigned num_matches;
+       const u8 * const in_base = in;
+       const u8 *in_next = in_base;
+       const u8 * const in_end = in_base + in_nbytes;
+       struct lz_match *cache_ptr = c->match_cache;
+       unsigned long prev_hash = 0;
  
  
-       if (c->cur_window_size <= 8192)
-               len_3_too_far = 2048;
-       else
-               len_3_too_far = 4096;
+       bt_matchfinder_init(&c->bt_mf);
  
         do {
  
         do {
-               /* Get longest match at the current position.  */
-               num_matches = lz_mf_get_matches(mf, matches);
+               unsigned num_matches;
  
  
-               if (num_matches == 0 ||
-                   (matches[num_matches - 1].len == 3 &&
-                    matches[num_matches - 1].offset >= len_3_too_far))
-               {
-                       /* No match, or length 3 match with large offset.
-                        * Choose a literal.  */
-                       xpress_declare_literal(c, *window_ptr, &next_chosen_item);
-                       window_ptr += 1;
-               } else {
-                       /* Match found.  Choose it.  */
-                       unsigned len = matches[num_matches - 1].len;
-                       unsigned offset = matches[num_matches - 1].offset;
-
-                       xpress_declare_match(c, len, offset, &next_chosen_item);
-                       lz_mf_skip_positions(mf, len - 1);
-                       window_ptr += len;
+               /* If we've found so many matches that the cache might overflow
+                * if we keep finding more, then stop finding matches.  This
+                * case is very unlikely.  */
+               if (unlikely(cache_ptr >= c->cache_overflow_mark)) {
+                       do {
+                               cache_ptr->length = 0;
+                               cache_ptr->offset = *in_next++;
+                               cache_ptr++;
+                       } while (in_next != in_end);
+                       return cache_ptr;
                 }
                 }
-       } while (window_ptr != window_end);
-
-       return next_chosen_item - c->chosen_items;
-}
  
  
-/* Literals-only parsing  */
-static u32
-xpress_choose_literals(struct xpress_compressor *c)
-{
-       const u8 *window_ptr = c->cur_window;
-       const u8 *window_end = &c->cur_window[c->cur_window_size];
-       struct xpress_item *next_chosen_item = c->chosen_items;
+               /* Find matches with the current position using the binary tree
+                * matchfinder and save them in the next available slots in
+                * the match cache.  */
+               num_matches =
+                       bt_matchfinder_get_matches(&c->bt_mf,
+                                                  in_base,
+                                                  in_next,
+                                                  XPRESS_MIN_MATCH_LEN,
+                                                  in_end - in_next,
+                                                  min(in_end - in_next, c->nice_match_length),
+                                                  c->max_search_depth,
+                                                  &prev_hash,
+                                                  cache_ptr);
+               cache_ptr += num_matches;
+               cache_ptr->length = num_matches;
+               cache_ptr->offset = *in_next;
+               in_next++;
+               cache_ptr++;
  
  
-       do {
-               xpress_declare_literal(c, *window_ptr++, &next_chosen_item);
-       } while (window_ptr != window_end);
+               if (num_matches) {
+                       /*
+                        * If there was a very long match found, then don't
+                        * cache any matches for the bytes covered by that
+                        * match.  This avoids degenerate behavior when
+                        * compressing highly redundant data, where the number
+                        * of matches can be very large.
+                        *
+                        * This heuristic doesn't actually hurt the compression
+                        * ratio very much.  If there's a long match, then the
+                        * data must be highly compressible, so it doesn't
+                        * matter as much what we do.
+                        */
+                       unsigned best_len = cache_ptr[-2].length;
+                       if (best_len >= c->nice_match_length) {
+                               --best_len;
+                               do {
+                                       bt_matchfinder_skip_position(&c->bt_mf,
+                                                                    in_base,
+                                                                    in_next,
+                                                                    in_end,
+                                                                    min(in_end - in_next,
+                                                                        c->nice_match_length),
+                                                                    c->max_search_depth,
+                                                                    &prev_hash);
+
+                                       cache_ptr->length = 0;
+                                       cache_ptr->offset = *in_next++;
+                                       cache_ptr++;
+                               } while (--best_len);
+                       }
+               }
+       } while (in_next != in_end);
  
  
-       return next_chosen_item - c->chosen_items;
+       return cache_ptr;
  }
  
  /*
  }
  
  /*
- * 'choose_items_func' is provided a data buffer c->cur_window of length
- * c->cur_window_size bytes.  This data buffer will have already been loaded
- * into the match-finder c->mf.  'choose_items_func' must choose the
- * match/literal sequence to output to represent this data buffer.  The
- * intermediate representation of this match/literal sequence must be recorded
- * in c->chosen_items, and the Huffman symbols used must be tallied in c->freqs.
- * The return value must be the number of items written to c->chosen_items.
+ * This is the "near-optimal" XPRESS compressor.  It computes a compressed
+ * representation of the input buffer by executing a minimum cost path search
+ * over the graph of possible match/literal choices, assuming a certain cost for
+ * each Huffman symbol.  The result is usually close to optimal, but it is *not*
+ * guaranteed to be optimal because of (a) heuristic restrictions in which
+ * matches are considered, and (b) symbol costs are unknown until those symbols
+ * have already been chosen --- so iterative optimization must be used, and the
+ * algorithm might converge on a local optimum rather than a global optimum.
   */
   */
-static u32
-xpress_choose_items(struct xpress_compressor *c)
+static size_t
+xpress_compress_near_optimal(struct xpress_compressor * restrict c,
+                            const void * restrict in, size_t in_nbytes,
+                            void * restrict out, size_t out_nbytes_avail)
  {
  {
-       return (*c->params.choose_items_func)(c);
-}
+       struct lz_match *end_cache_ptr;
+       unsigned num_passes_remaining = c->num_optim_passes;
  
  
-/* Set internal compression parameters for the specified compression level and
- * maximum window size.  */
-static void
-xpress_build_params(unsigned int compression_level, u32 max_window_size,
-                   struct xpress_compressor_params *xpress_params)
-{
-       memset(xpress_params, 0, sizeof(*xpress_params));
-       xpress_params->num_optim_passes = 1;
-
-       if (compression_level == 1) {
-
-               /* Literal-only parsing  */
-               xpress_params->choose_items_func = xpress_choose_literals;
-               xpress_params->mf_algo = LZ_MF_NULL;
-
-       } else if (compression_level < 30) {
-
-               /* Greedy parsing  */
-               xpress_params->choose_items_func = xpress_choose_greedy_items;
-               xpress_params->mf_algo = LZ_MF_HASH_CHAINS;
-               xpress_params->nice_match_length = compression_level;
-               xpress_params->max_search_depth = compression_level / 2;
-
-       } else if (compression_level < 60) {
-
-               /* Lazy parsing  */
-               xpress_params->choose_items_func = xpress_choose_lazy_items;
-               xpress_params->mf_algo = LZ_MF_HASH_CHAINS;
-               xpress_params->nice_match_length = compression_level;
-               xpress_params->max_search_depth = compression_level / 2;
-
-       } else {
-
-               /* Near-optimal parsing  */
-               xpress_params->choose_items_func = xpress_choose_near_optimal_items;
-               if (max_window_size >= 16384)
-                       xpress_params->mf_algo = LZ_MF_BINARY_TREES;
-               else
-                       xpress_params->mf_algo = LZ_MF_HASH_CHAINS;
-               xpress_params->num_optim_passes = compression_level / 40;
-               xpress_params->nice_match_length = min(compression_level / 2,
-                                                      XPRESS_MAX_MATCH_LEN);
-               xpress_params->max_search_depth = min(compression_level,
-                                                     XPRESS_MAX_MATCH_LEN);
-       }
-}
+       /* Run the input buffer through the matchfinder and save the results. */
+       end_cache_ptr = xpress_find_matches(c, in, in_nbytes);
  
  
-/* Given the internal compression parameters and maximum window size, build the
- * Lempel-Ziv match-finder parameters.  */
-static void
-xpress_build_mf_params(const struct xpress_compressor_params *xpress_params,
-                      u32 max_window_size, struct lz_mf_params *mf_params)
-{
-       memset(mf_params, 0, sizeof(*mf_params));
-
-       mf_params->algorithm = xpress_params->mf_algo;
-       mf_params->max_window_size = max_window_size;
-       mf_params->min_match_len = XPRESS_MIN_MATCH_LEN;
-       mf_params->max_match_len = XPRESS_MAX_MATCH_LEN;
-       mf_params->max_search_depth = xpress_params->max_search_depth;
-       mf_params->nice_match_len = xpress_params->nice_match_length;
+       /* The first optimization pass uses a default cost model.  Each
+        * additional optimization pass uses a cost model derived from the
+        * Huffman code computed in the previous pass.  */
+       xpress_set_default_costs(c);
+       do {
+               xpress_find_min_cost_path(c, in_nbytes, end_cache_ptr);
+               xpress_tally_item_list(c, c->optimum_nodes + in_nbytes);
+               if (num_passes_remaining > 1) {
+                       c->freqs[XPRESS_END_OF_DATA]++;
+                       xpress_make_huffman_code(c);
+                       xpress_update_costs(c);
+                       xpress_reset_symbol_frequencies(c);
+               }
+       } while (--num_passes_remaining);
+
+       return xpress_write(c, out, out_nbytes_avail, in_nbytes, true);
  }
  
  }
  
-static void
-xpress_free_compressor(void *_c);
+#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
  
  static u64
  
  static u64
-xpress_get_needed_memory(size_t max_window_size, unsigned int compression_level)
+xpress_get_needed_memory(size_t max_bufsize, unsigned compression_level)
  {
  {
-       u64 size = 0;
-       struct xpress_compressor_params params;
+       size_t size = 0;
  
  
-       if (max_window_size > XPRESS_MAX_OFFSET + 1)
+       if (max_bufsize > XPRESS_MAX_BUFSIZE)
                 return 0;
  
                 return 0;
  
-       xpress_build_params(compression_level, max_window_size, &params);
-
-       size += sizeof(struct xpress_compressor);
-
-       /* mf */
-       size += lz_mf_get_needed_memory(params.mf_algo, max_window_size);
-
-       /* optimum */
-       if (params.choose_items_func == xpress_choose_near_optimal_items) {
-               size += (XPRESS_OPTIM_ARRAY_LENGTH + params.nice_match_length) *
-                       sizeof(struct xpress_mc_pos_data);
+       if (compression_level < MIN_LEVEL_FOR_NEAR_OPTIMAL ||
+           !SUPPORT_NEAR_OPTIMAL_PARSING) {
+               size += offsetof(struct xpress_compressor, nonoptimal_end);
+               size += max_bufsize * sizeof(struct xpress_item);
         }
         }
-
-       /* cached_matches */
-       if (params.num_optim_passes > 1) {
-               size_t cache_len = max(max_window_size * XPRESS_CACHE_PER_POS,
-                                      params.max_search_depth + 1);
-               size += cache_len * sizeof(struct lz_match);
-       } else {
-               size += params.max_search_depth * sizeof(struct lz_match);
+#if SUPPORT_NEAR_OPTIMAL_PARSING
+       else {
+               size += offsetof(struct xpress_compressor, optimal_end);
+               size += (max_bufsize + 1) * sizeof(struct xpress_optimum_node);
+               size += ((max_bufsize * CACHE_RESERVE_PER_POS) +
+                        XPRESS_MAX_MATCH_LEN + max_bufsize) *
+                               sizeof(struct lz_match);
         }
         }
-
-       /* chosen_items */
-       size += max_window_size * sizeof(struct xpress_item);
-
+#endif
         return size;
  }
  
  static int
         return size;
  }
  
  static int
-xpress_create_compressor(size_t max_window_size, unsigned int compression_level,
+xpress_create_compressor(size_t max_bufsize, unsigned compression_level,
                          void **c_ret)
  {
         struct xpress_compressor *c;
                          void **c_ret)
  {
         struct xpress_compressor *c;
-       struct xpress_compressor_params params;
-       struct lz_mf_params mf_params;
  
  
-       if (max_window_size > XPRESS_MAX_OFFSET + 1)
+       if (max_bufsize > XPRESS_MAX_BUFSIZE)
                 return WIMLIB_ERR_INVALID_PARAM;
  
                 return WIMLIB_ERR_INVALID_PARAM;
  
-       xpress_build_params(compression_level, max_window_size, &params);
-       xpress_build_mf_params(&params, max_window_size, &mf_params);
-
-       c = CALLOC(1, sizeof(struct xpress_compressor));
-       if (!c)
-               goto oom;
-
-       c->params = params;
-
-       c->mf = lz_mf_alloc(&mf_params);
-       if (!c->mf)
-               goto oom;
-
-       if (params.choose_items_func == xpress_choose_near_optimal_items) {
-               c->optimum = MALLOC((XPRESS_OPTIM_ARRAY_LENGTH +
-                                    params.nice_match_length) *
-                                     sizeof(struct xpress_mc_pos_data));
-               if (!c->optimum)
-                       goto oom;
+       if (compression_level < 30) {
+               c = ALIGNED_MALLOC(offsetof(struct xpress_compressor,
+                                           nonoptimal_end),
+                                  MATCHFINDER_ALIGNMENT);
+               if (!c)
+                       return WIMLIB_ERR_NOMEM;
+               c->impl = xpress_compress_greedy;
+               c->max_search_depth = (compression_level * 24) / 16;
+               c->nice_match_length = (compression_level * 48) / 16;
+               c->chosen_items = MALLOC(max_bufsize * sizeof(struct xpress_item));
+               if (!c->chosen_items) {
+                       ALIGNED_FREE(c);
+                       return WIMLIB_ERR_NOMEM;
+               }
+       } else if (compression_level < MIN_LEVEL_FOR_NEAR_OPTIMAL ||
+                  !SUPPORT_NEAR_OPTIMAL_PARSING)
+       {
+               c = ALIGNED_MALLOC(offsetof(struct xpress_compressor,
+                                           nonoptimal_end),
+                                  MATCHFINDER_ALIGNMENT);
+               if (!c)
+                       return WIMLIB_ERR_NOMEM;
+
+               c->impl = xpress_compress_lazy;
+               c->max_search_depth = (compression_level * 24) / 32;
+               c->nice_match_length = (compression_level * 48) / 32;
+               c->chosen_items = MALLOC(max_bufsize * sizeof(struct xpress_item));
+               if (!c->chosen_items) {
+                       ALIGNED_FREE(c);
+                       return WIMLIB_ERR_NOMEM;
+               }
         }
         }
-
-       if (params.num_optim_passes > 1) {
-               size_t cache_len = max(max_window_size * XPRESS_CACHE_PER_POS,
-                                      params.max_search_depth + 1);
-               c->cached_matches = MALLOC(cache_len * sizeof(struct lz_match));
-               if (!c->cached_matches)
-                       goto oom;
-               c->cache_limit = c->cached_matches + cache_len -
-                                  (params.max_search_depth + 1);
-       } else {
-               c->cached_matches = MALLOC(params.max_search_depth *
-                                          sizeof(struct lz_match));
-               if (!c->cached_matches)
-                       goto oom;
+#if SUPPORT_NEAR_OPTIMAL_PARSING
+       else {
+               c = ALIGNED_MALLOC(offsetof(struct xpress_compressor,
+                                           optimal_end),
+                                  MATCHFINDER_ALIGNMENT);
+               if (!c)
+                       return WIMLIB_ERR_NOMEM;
+               c->impl = xpress_compress_near_optimal;
+               c->max_search_depth = (compression_level * 32) / 100;
+               c->nice_match_length = (compression_level * 50) / 100;
+               c->num_optim_passes = compression_level / 40;
+
+               c->optimum_nodes = MALLOC((max_bufsize + 1) *
+                                         sizeof(struct xpress_optimum_node));
+               c->match_cache = MALLOC(((max_bufsize * CACHE_RESERVE_PER_POS) +
+                                        XPRESS_MAX_MATCH_LEN + max_bufsize) *
+                                       sizeof(struct lz_match));
+               if (!c->optimum_nodes || !c->match_cache) {
+                       FREE(c->optimum_nodes);
+                       FREE(c->match_cache);
+                       ALIGNED_FREE(c);
+                       return WIMLIB_ERR_NOMEM;
+               }
+               c->cache_overflow_mark =
+                       &c->match_cache[max_bufsize * CACHE_RESERVE_PER_POS];
         }
         }
-
-       c->chosen_items = MALLOC(max_window_size * sizeof(struct xpress_item));
-       if (!c->chosen_items)
-               goto oom;
+#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
  
         *c_ret = c;
         return 0;
  
         *c_ret = c;
         return 0;
-
-oom:
-       xpress_free_compressor(c);
-       return WIMLIB_ERR_NOMEM;
  }
  
  static size_t
  }
  
  static size_t
-xpress_compress(const void *uncompressed_data, size_t uncompressed_size,
-               void *compressed_data, size_t compressed_size_avail, void *_c)
+xpress_compress(const void *in, size_t in_nbytes,
+               void *out, size_t out_nbytes_avail, void *_c)
  {
         struct xpress_compressor *c = _c;
  {
         struct xpress_compressor *c = _c;
-       u32 num_chosen_items;
-       u8 *cptr;
-       struct xpress_output_bitstream os;
-       u32 compressed_size;
  
  
-       /* XPRESS requires 256 bytes of overhead for the Huffman code, so it's
-        * impossible to compress 256 bytes or less of data to less than the
-        * input size.  */
-       if (compressed_size_avail < XPRESS_NUM_SYMBOLS / 2 + 50)
+       if (out_nbytes_avail <= XPRESS_NUM_SYMBOLS / 2 + 4)
                 return 0;
  
                 return 0;
  
-       /* Determine match/literal sequence.  */
-       c->cur_window = uncompressed_data;
-       c->cur_window_size = uncompressed_size;
-       lz_mf_load_window(c->mf, c->cur_window, c->cur_window_size);
-       memset(c->freqs, 0, sizeof(c->freqs));
-
-       num_chosen_items = xpress_choose_items(c);
-
-       c->freqs[XPRESS_END_OF_DATA]++;
-       xpress_make_huffman_code(c);
-
-       /* Output the Huffman code as a series of 512 4-bit lengths.  */
-       cptr = compressed_data;
-       for (unsigned i = 0; i < XPRESS_NUM_SYMBOLS; i += 2)
-               *cptr++ = (c->lens[i + 1] << 4) | c->lens[i];
-
-       /* Output the encoded matches/literals.  */
-       xpress_init_output(&os, cptr,
-                          compressed_size_avail - XPRESS_NUM_SYMBOLS / 2);
-       xpress_write_items(&os, c->chosen_items, num_chosen_items,
-                          c->codewords, c->lens);
-
-       /* Flush any pending data and get the length of the compressed data.  */
-       compressed_size = xpress_flush_output(&os);
-       if (compressed_size == 0)
-               return 0;
+       xpress_reset_symbol_frequencies(c);
  
  
-       /* Return the length of the compressed data.  */
-       return compressed_size + XPRESS_NUM_SYMBOLS / 2;
+       return (*c->impl)(c, in, in_nbytes, out, out_nbytes_avail);
  }
  
  static void
  }
  
  static void
@@ -1240,11 +1137,14 @@ xpress_free_compressor(void *_c)
         struct xpress_compressor *c = _c;
  
         if (c) {
         struct xpress_compressor *c = _c;
  
         if (c) {
-               lz_mf_free(c->mf);
-               FREE(c->optimum);
-               FREE(c->cached_matches);
-               FREE(c->chosen_items);
-               FREE(c);
+       #if SUPPORT_NEAR_OPTIMAL_PARSING
+               if (c->impl == xpress_compress_near_optimal) {
+                       FREE(c->optimum_nodes);
+                       FREE(c->match_cache);
+               } else
+       #endif
+                       FREE(c->chosen_items);
+               ALIGNED_FREE(c);
         }
  }
  
         }
  }
author	Eric Biggers <ebiggers3@gmail.com>
	Tue, 23 Dec 2014 05:16:38 +0000 (23:16 -0600)
committer	Eric Biggers <ebiggers3@gmail.com>
	Tue, 23 Dec 2014 05:36:49 +0000 (23:36 -0600)
Makefile.am		patch \| blob \| history
NEWS		patch \| blob \| history
include/wimlib/bt_matchfinder.h	[new file with mode: 0644]	patch \| blob
include/wimlib/hc_matchfinder.h	[new file with mode: 0644]	patch \| blob
include/wimlib/lz_hash3.h		patch \| blob \| history
include/wimlib/matchfinder_avx2.h	[new file with mode: 0644]	patch \| blob
include/wimlib/matchfinder_common.h	[new file with mode: 0644]	patch \| blob
include/wimlib/matchfinder_nonsliding.h	[new file with mode: 0644]	patch \| blob
include/wimlib/matchfinder_sliding.h	[new file with mode: 0644]	patch \| blob
include/wimlib/matchfinder_sse2.h	[new file with mode: 0644]	patch \| blob
src/xpress-compress.c		patch \| blob \| history