wimlib.net Git - wimlib/commitdiff
Compression updates
author Eric Biggers <ebiggers3@gmail.com>
Sun, 14 Sep 2014 05:14:07 +0000 (00:14 -0500)
committer Eric Biggers <ebiggers3@gmail.com>
Sun, 14 Sep 2014 05:37:31 +0000 (00:37 -0500)
- Faster searching for repeat offset matches
- Lazy updates of adaptive state in near-optimal parsing
- Faster LZX near-optimal parsing
- Better LZX lazy parsing
- Faster XPRESS near-optimal parsing
- Faster LZMS near-optimal parsing
- Faster LZMS match/literal output
- Slightly faster LZMS decompression
- Cleanups

15 files changed:
Makefile.am
NEWS
README
include/wimlib/lz_repsearch.h
include/wimlib/lzms.h
include/wimlib/lzms_constants.h [new file with mode: 0644]
include/wimlib/lzx.h
include/wimlib/lzx_constants.h
src/lzms-common.c
src/lzms-compress.c
src/lzms-decompress.c
src/lzx-common.c
src/lzx-compress.c
src/lzx-decompress.c
src/xpress-compress.c

diff --git a/Makefile.am b/Makefile.am
index cff4aa8993c060d56446ebe8f7be6d30a1b61fdd..8f6277dbdcd5992241e46ac68417378227ebe3c0 100644 (file)
@@ -110,6 +110,7 @@ libwim_la_SOURCES =         \
        include/wimlib/lz_repsearch.h   \
        include/wimlib/lz_suffix_array_utils.h  \
        include/wimlib/lzms.h           \
+       include/wimlib/lzms_constants.h \
        include/wimlib/lzx.h            \
        include/wimlib/lzx_constants.h  \
        include/wimlib/metadata.h       \
diff --git a/NEWS b/NEWS
index 70ebfb4f99594cae81478888fee0f80721d30569..1c13bfafe8895b0e9d27b6b7b3289fe621fd66df 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -1,5 +1,5 @@
 Version 1.7.2-BETA:
-       More compression performance improvements.
+       Made more improvements to the XPRESS, LZX, and LZMS compressors.
 
        Fixes for setting short names on Windows.
 
diff --git a/README b/README
index 95f990d9cf0d5a9e1c5f0ac811b8e58eade35b51..419c3da6d6cd81e6a464cd59da8a8eab378b1c3b 100644 (file)
--- a/README
+++ b/README
@@ -78,18 +78,18 @@ create the file.  When applicable, the results with the equivalent Microsoft
 implementation in WIMGAPI is included.
 
   =============================================================================
-  | Compression            ||  wimlib (v1.7.2-BETA)  |  WIMGAPI (Windows 8.1) |
+  | Compression            ||  wimlib (v1.7.2)       |  WIMGAPI (Windows 8.1) |
   =============================================================================
-  | None             [1]   ||  361,404,682 in 3.4s   |  361,364,994 in 4.2s   |
-  | XPRESS           [2]   ||  138,398,747 in 4.2s   |  140,468,002 in 5.1s   |
-  | XPRESS (slow)    [3]   ||  135,284,950 in 10.3s  |  N/A                   |
-  | LZX (quick)      [4]   ||  131,861,913 in 4.7s   |  N/A                   |
-  | LZX (normal)     [5]   ||  126,855,247 in 14.9s  |  127,301,774 in 18.2s  |
-  | LZX (slow)       [6]   ||  126,245,561 in 32.1s  |  N/A                   |
-  | LZMS (non-solid) [7]   ||  122,126,328 in 16.4s  |  N/A                   |
-  | LZMS (solid)     [8]   ||  93,795,440  in 47.4s  |  88,789,426 in 96.8s   |
-  | "WIMBoot"        [9]   ||  167,121,495 in 5.3s   |  169,124,968 in 9.3s   |
-  | "WIMBoot" (slow) [10]  ||  165,219,818 in 9.4s   |  N/A                   |
+  | None             [1]   ||  361,314,224 in 3.4s   |  361,315,338 in 4.5s   |
+  | XPRESS           [2]   ||  138,380,918 in 4.2s   |  140,457,487 in 6.3s   |
+  | XPRESS (slow)    [3]   ||  135,269,627 in 11.1s  |  N/A                   |
+  | LZX (quick)      [4]   ||  130,332,081 in 4.7s   |  N/A                   |
+  | LZX (normal)     [5]   ||  126,714,941 in 12.9s  |  127,293,240 in 19.2s  |
+  | LZX (slow)       [6]   ||  126,150,725 in 23.4s  |  N/A                   |
+  | LZMS (non-solid) [7]   ||  121,909,750 in 13.3s  |  N/A                   |
+  | LZMS (solid)     [8]   ||  93,650,894  in 44.4s  |  88,771,192 in 109.2s  |
+  | "WIMBoot"        [9]   ||  167,095,369 in 6.4s   |  169,109,650 in 10.7s  |
+  | "WIMBoot" (slow) [10]  ||  165,195,668 in 9.5s   |  N/A                   |
   =============================================================================
 
 Notes:
@@ -139,36 +139,39 @@ Testing environment:
 
 The compression ratio provided by wimlib is also competitive with commonly used
 archive formats.  Below are file sizes that result when the Canterbury corpus is
-compressed with wimlib (v1.7.0), WIMGAPI (Windows 8), and some other
+compressed with wimlib (v1.7.2), WIMGAPI (Windows 8.1), and some other
 formats/programs:
 
-     =================================================
-     | Format                         | Size (bytes) |
-     =================================================
-     | tar                            | 2,826,240    |
-     | WIM (WIMGAPI, None)            | 2,814,278    |
-     | WIM (wimlib, None)             | 2,813,856    |
-     | WIM (WIMGAPI, XPRESS)          | 825,410      |
-     | WIM (wimlib, XPRESS)           | 792,024      |
-     | tar.gz (gzip, default)         | 738,796      |
-     | ZIP (Info-ZIP, default)        | 735,334      |
-     | tar.gz (gzip, -9)              | 733,971      |
-     | ZIP (Info-ZIP, -9)             | 732,297      |
-     | WIM (wimlib, LZX quick)        | 722,196      |
-     | WIM (WIMGAPI, LZX)             | 651,766      |
-     | WIM (wimlib, LZX normal)       | 649,204      |
-     | WIM (wimlib, LZX slow)         | 639,618      |
-     | WIM (wimlib, LZMS non-solid)   | 592,136      |
-     | tar.bz2 (bzip, default)        | 565,008      |
-     | tar.bz2 (bzip, -9)             | 565,008      |
-     | WIM (wimlib, LZMS solid)       | 525,270      |
-     | WIM (wimlib, LZMS solid, slow) | 521,700      |
-     | WIM (WIMGAPI, LZMS solid)      | 521,232      |
-     | tar.xz (xz, default)           | 486,916      |
-     | tar.xz (xz, -9)                | 486,904      |
-     | 7z  (7-zip, default)           | 484,700      |
-     | 7z  (7-zip, -9)                | 483,239      |
-     =================================================
+     =====================================================
+     | Format                             | Size (bytes) |
+     =====================================================
+     | tar                                | 2,826,240    |
+     | WIM (WIMGAPI, None)                | 2,814,254    |
+     | WIM (wimlib, None)                 | 2,814,216    |
+     | WIM (WIMGAPI, XPRESS)              | 825,536      |
+     | WIM (wimlib, XPRESS)               | 790,016      |
+     | tar.gz (gzip, default)             | 738,796      |
+     | ZIP (Info-ZIP, default)            | 735,334      |
+     | tar.gz (gzip, -9)                  | 733,971      |
+     | ZIP (Info-ZIP, -9)                 | 732,297      |
+     | WIM (wimlib, LZX quick)            | 704,006      |
+     | WIM (WIMGAPI, LZX)                 | 651,866      |
+     | WIM (wimlib, LZX normal)           | 632,614      |
+     | WIM (wimlib, LZX slow)             | 625,050      |
+     | WIM (wimlib, LZMS non-solid)       | 581,960      |
+     | tar.bz2 (bzip, default)            | 565,008      |
+     | tar.bz2 (bzip, -9)                 | 565,008      |
+     | WIM (wimlib, LZX solid)            | 532,700      |
+     | WIM (wimlib, LZMS solid)           | 525,990      |
+     | WIM (wimlib, LZX solid, slow)      | 525,140      |
+     | WIM (wimlib, LZMS solid, slow)     | 523,728      |
+     | WIM (WIMGAPI, LZMS solid)          | 521,366      |
+     | WIM (wimlib, LZX solid, very slow) | 520,832      |
+     | tar.xz (xz, default)               | 486,916      |
+     | tar.xz (xz, -9)                    | 486,904      |
+     | 7z  (7-zip, default)               | 484,700      |
+     | 7z  (7-zip, -9)                    | 483,239      |
+     =====================================================
 
 Note: WIM does even better on directory trees containing duplicate files, which
 the Canterbury corpus doesn't have.
diff --git a/include/wimlib/lz_repsearch.h b/include/wimlib/lz_repsearch.h
index fe59558bdb46d29047d8d0657347a8b601cf6445..6883bf624a8eb66be6b5b77b6070ad1b85163a9b 100644 (file)
 #define _LZ_REPSEARCH_H
 
 #include "wimlib/lz_extend.h"
-#include "wimlib/util.h"
 
 extern u32
 lz_extend_repmatch(const u8 *strptr, const u8 *matchptr, u32 max_len);
 
 /*
- * Find the longest repeat offset match.
+ * Given a pointer to the current string and a queue of 3 recent match offsets,
+ * find the longest repeat offset match.
  *
  * If no match of at least 2 bytes is found, then return 0.
  *
  * If a match of at least 2 bytes is found, then return its length and set
- * *slot_ret to the index of its offset in @queue.
- */
+ * *rep_max_idx_ret to the index of its offset in @recent_offsets.
+ */
 static inline u32
-lz_repsearch(const u8 * const strptr, const u32 bytes_remaining,
-            const u32 max_match_len, const u32 repeat_offsets[],
-            const unsigned num_repeat_offsets, unsigned *slot_ret)
+lz_repsearch3(const u8 * const strptr, const u32 max_len,
+             const u32 recent_offsets[3], unsigned *rep_max_idx_ret)
 {
-       u32 best_len = 0;
-
-       if (likely(bytes_remaining >= 2)) {
-               const u32 max_len = min(max_match_len, bytes_remaining);
-               const u16 str = *(const u16 *)strptr;
-
-               for (unsigned i = 0; i < num_repeat_offsets; i++) {
-                       const u8 * const matchptr = strptr - repeat_offsets[i];
-
-                       /* Check the first two bytes.  If they match, then
-                        * extend the match to its full length.  */
-                       if (*(const u16 *)matchptr == str) {
-                               const u32 len = lz_extend_repmatch(strptr, matchptr, max_len);
-                               if (len > best_len) {
-                                       best_len = len;
-                                       *slot_ret = i;
-                               }
-                       }
+       unsigned rep_max_idx;
+       u32 rep_len;
+       u32 rep_max_len;
+       const u16 str = *(const u16 *)strptr;
+       const u8 *matchptr;
+
+       matchptr = strptr - recent_offsets[0];
+       if (*(const u16 *)matchptr == str)
+               rep_max_len = lz_extend_repmatch(strptr, matchptr, max_len);
+       else
+               rep_max_len = 0;
+       rep_max_idx = 0;
+
+       matchptr = strptr - recent_offsets[1];
+       if (*(const u16 *)matchptr == str) {
+               rep_len = lz_extend_repmatch(strptr, matchptr, max_len);
+               if (rep_len > rep_max_len) {
+                       rep_max_len = rep_len;
+                       rep_max_idx = 1;
                }
        }
-       return best_len;
+
+       matchptr = strptr - recent_offsets[2];
+       if (*(const u16 *)matchptr == str) {
+               rep_len = lz_extend_repmatch(strptr, matchptr, max_len);
+               if (rep_len > rep_max_len) {
+                       rep_max_len = rep_len;
+                       rep_max_idx = 2;
+               }
+       }
+
+       *rep_max_idx_ret = rep_max_idx;
+       return rep_max_len;
 }
 
 #endif /* _LZ_REPSEARCH_H */
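
The comment above fully specifies the contract of the new lz_repsearch3().  As
a rough standalone model of the same idea (hypothetical names; plain byte
comparisons instead of the real function's unaligned 16-bit loads and manual
unrolling; assumes max_len >= 2, which the callers guarantee):

#include <assert.h>
#include <stdint.h>

static uint32_t
repsearch3_model(const uint8_t *str, uint32_t max_len,
                 const uint32_t recent_offsets[3], unsigned *rep_max_idx)
{
        uint32_t best_len = 0;

        *rep_max_idx = 0;
        for (unsigned i = 0; i < 3; i++) {
                const uint8_t *match = str - recent_offsets[i];

                /* Cheap 2-byte filter before the full extension loop.  */
                if (match[0] == str[0] && match[1] == str[1]) {
                        uint32_t len = 2;
                        while (len < max_len && match[len] == str[len])
                                len++;
                        if (len > best_len) {
                                best_len = len;
                                *rep_max_idx = i;
                        }
                }
        }
        return best_len;        /* 0 means no repeat match of length >= 2 */
}

int main(void)
{
        /* In "abcabcab", position 3 has a 5-byte match at offset 3.  */
        const uint8_t buf[] = "abcabcab";
        const uint32_t offsets[3] = { 2, 3, 1 };
        unsigned idx;

        assert(repsearch3_model(buf + 3, 5, offsets, &idx) == 5 && idx == 1);
        return 0;
}

Unrolling the loop over the fixed 3-entry queue, as the patch does, removes the
loop overhead and lets the three 2-byte checks be scheduled independently.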
diff --git a/include/wimlib/lzms.h b/include/wimlib/lzms.h
index 76381a416829de89773d65de795d788cd7b06b7f..94bcba826f772e1af34a15ca35638964588a5aab 100644 (file)
@@ -1,8 +1,14 @@
+/*
+ * lzms.h
+ *
+ * Declarations shared between LZMS compression and decompression.
+ */
+
 #ifndef _WIMLIB_LZMS_H
 #define _WIMLIB_LZMS_H
 
-/* Constants for the LZMS data compression format.  See the comments in
- * lzms-decompress.c for more information about this format.  */
+#include "wimlib/lzms_constants.h"
+#include "wimlib/util.h"
 
 //#define ENABLE_LZMS_DEBUG
 #ifdef ENABLE_LZMS_DEBUG
 #      define LZMS_ASSERT(...)
 #endif
 
-#define LZMS_NUM_RECENT_OFFSETS                        3
-#define LZMS_MAX_INIT_RECENT_OFFSET            (LZMS_NUM_RECENT_OFFSETS + 1)
-
-#define LZMS_PROBABILITY_BITS                  6
-#define LZMS_PROBABILITY_MAX                   (1U << LZMS_PROBABILITY_BITS)
-#define LZMS_INITIAL_PROBABILITY               48
-#define LZMS_INITIAL_RECENT_BITS               0x0000000055555555ULL
-
-#define LZMS_NUM_MAIN_STATES                   16
-#define LZMS_NUM_MATCH_STATES                  32
-#define LZMS_NUM_LZ_MATCH_STATES               64
-#define LZMS_NUM_LZ_REPEAT_MATCH_STATES                64
-#define LZMS_NUM_DELTA_MATCH_STATES            64
-#define LZMS_NUM_DELTA_REPEAT_MATCH_STATES     64
-#define LZMS_MAX_NUM_STATES                    64
-
-#define LZMS_NUM_LITERAL_SYMS                  256
-#define LZMS_NUM_LEN_SYMS                      54
-#define LZMS_NUM_DELTA_POWER_SYMS              8
-#define LZMS_MAX_NUM_OFFSET_SYMS               799
-#define LZMS_MAX_NUM_SYMS                      799
-
-#define LZMS_MAX_CODEWORD_LEN                  15
-
-#define LZMS_LITERAL_CODE_REBUILD_FREQ         1024
-#define LZMS_LZ_OFFSET_CODE_REBUILD_FREQ       1024
-#define LZMS_LENGTH_CODE_REBUILD_FREQ          512
-#define LZMS_DELTA_OFFSET_CODE_REBUILD_FREQ    1024
-#define LZMS_DELTA_POWER_CODE_REBUILD_FREQ     512
-
-#define LZMS_X86_MAX_GOOD_TARGET_OFFSET                65535
-#define LZMS_X86_MAX_TRANSLATION_OFFSET                1023
-
-/* Code shared between the LZMS decompressor and compressor.  */
-
-#include <wimlib/util.h>
-
 extern void
 lzms_x86_filter(u8 data[], s32 size, s32 last_target_usages[], bool undo);
 
@@ -60,9 +29,9 @@ struct lzms_probability_entry {
 
        /* Number of zeroes in the most recent LZMS_PROBABILITY_MAX bits that
         * have been coded using this probability entry.  This is a cached value
-        * because it can be computed as LZMS_PROBABILITY_MAX minus the Hamming
-        * weight of the low-order LZMS_PROBABILITY_MAX bits of @recent_bits.
-        * */
+        * because it can be computed as LZMS_PROBABILITY_MAX minus the number
+        * of bits set in the low-order LZMS_PROBABILITY_MAX bits of
+        * @recent_bits.  */
        u32 num_recent_zero_bits;
 
        /* The most recent LZMS_PROBABILITY_MAX bits that have been coded using
@@ -104,61 +73,45 @@ struct lzms_lru_queues {
         struct lzms_delta_lru_queues delta;
 };
 
-extern u32 lzms_position_slot_base[LZMS_MAX_NUM_OFFSET_SYMS + 1];
-
-extern u8 lzms_extra_position_bits[LZMS_MAX_NUM_OFFSET_SYMS];
-
-extern u16 lzms_order_to_position_slot_bounds[30][2];
+/* Offset slot tables  */
+extern u32 lzms_offset_slot_base[LZMS_MAX_NUM_OFFSET_SYMS + 1];
+extern u8 lzms_extra_offset_bits[LZMS_MAX_NUM_OFFSET_SYMS];
 
+/* Length slot tables  */
 extern u32 lzms_length_slot_base[LZMS_NUM_LEN_SYMS + 1];
-
-#define LZMS_NUM_FAST_LENGTHS 1024
-extern u8 lzms_length_slot_fast[LZMS_NUM_FAST_LENGTHS];
-
 extern u8 lzms_extra_length_bits[LZMS_NUM_LEN_SYMS];
 
 extern void
 lzms_init_slots(void);
 
-/* Return the slot for the specified value.  */
-extern u32
-lzms_get_slot(u32 value, const u32 slot_base_tab[], u32 num_slots);
+extern unsigned
+lzms_get_slot(u32 value, const u32 slot_base_tab[], unsigned num_slots);
 
-static inline u32
-lzms_get_position_slot(u32 position)
+/* Return the offset slot for the specified offset  */
+static inline unsigned
+lzms_get_offset_slot(u32 offset)
 {
-       u32 order = bsr32(position);
-       u32 l = lzms_order_to_position_slot_bounds[order][0];
-       u32 r = lzms_order_to_position_slot_bounds[order][1];
-
-       for (;;) {
-               u32 slot = (l + r) / 2;
-               if (position >= lzms_position_slot_base[slot]) {
-                       if (position < lzms_position_slot_base[slot + 1])
-                               return slot;
-                       else
-                               l = slot + 1;
-               } else {
-                       r = slot - 1;
-               }
-       }
+       return lzms_get_slot(offset, lzms_offset_slot_base, LZMS_MAX_NUM_OFFSET_SYMS);
 }
 
-static inline u32
+/* Return the length slot for the specified length  */
+static inline unsigned
 lzms_get_length_slot(u32 length)
 {
-       if (likely(length < LZMS_NUM_FAST_LENGTHS))
-               return lzms_length_slot_fast[length];
-       else
-               return lzms_get_slot(length, lzms_length_slot_base,
-                                    LZMS_NUM_LEN_SYMS);
+       return lzms_get_slot(length, lzms_length_slot_base, LZMS_NUM_LEN_SYMS);
 }
 
+extern void
+lzms_init_lz_lru_queues(struct lzms_lz_lru_queues *lz);
+
+extern void
+lzms_init_delta_lru_queues(struct lzms_delta_lru_queues *delta);
+
 extern void
 lzms_init_lru_queues(struct lzms_lru_queues *lru);
 
 extern void
-lzms_update_lz_lru_queues(struct lzms_lz_lru_queues *lz);
+lzms_update_lz_lru_queue(struct lzms_lz_lru_queues *lz);
 
 extern void
 lzms_update_delta_lru_queues(struct lzms_delta_lru_queues *delta);
@@ -166,4 +119,36 @@ lzms_update_delta_lru_queues(struct lzms_delta_lru_queues *delta);
 extern void
 lzms_update_lru_queues(struct lzms_lru_queues *lru);
 
+/* Given a decoded bit, update the probability entry.  */
+static inline void
+lzms_update_probability_entry(struct lzms_probability_entry *prob_entry, int bit)
+{
+       s32 delta_zero_bits;
+
+       BUILD_BUG_ON(LZMS_PROBABILITY_MAX != sizeof(prob_entry->recent_bits) * 8);
+
+       delta_zero_bits = (s32)(prob_entry->recent_bits >> (LZMS_PROBABILITY_MAX - 1)) - bit;
+
+       prob_entry->num_recent_zero_bits += delta_zero_bits;
+       prob_entry->recent_bits <<= 1;
+       prob_entry->recent_bits |= bit;
+}
+
+/* Given a probability entry, return the chance out of LZMS_PROBABILITY_MAX that
+ * the next decoded bit will be a 0.  */
+static inline u32
+lzms_get_probability(const struct lzms_probability_entry *prob_entry)
+{
+       u32 prob;
+
+       prob = prob_entry->num_recent_zero_bits;
+
+       /* 0% and 100% probabilities aren't allowed.  */
+       if (prob == 0)
+               prob++;
+       if (prob == LZMS_PROBABILITY_MAX)
+               prob--;
+       return prob;
+}
+
 #endif /* _WIMLIB_LZMS_H  */
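
The probability entries manipulated by the two new inline functions above can
be exercised on their own.  A minimal self-contained sketch (hypothetical
names; plain stdint types instead of wimlib's u32/u64; the constants are the
ones from lzms_constants.h below):

#include <stdint.h>
#include <stdio.h>

#define LZMS_PROBABILITY_BITS           6
#define LZMS_PROBABILITY_MAX            (1U << LZMS_PROBABILITY_BITS)  /* 64 */
#define LZMS_INITIAL_PROBABILITY        48

/* Stand-in for struct lzms_probability_entry: a 64-bit window of the most
 * recently coded bits plus a cached count of the zero bits in it.  */
struct prob_entry {
        uint32_t num_recent_zero_bits;
        uint64_t recent_bits;
};

/* Same rule as lzms_update_probability_entry(): the bit falling out of the
 * window adjusts the cached zero count, then the new bit is shifted in.  */
static void update(struct prob_entry *e, int bit)
{
        e->num_recent_zero_bits +=
                (int32_t)(e->recent_bits >> (LZMS_PROBABILITY_MAX - 1)) - bit;
        e->recent_bits = (e->recent_bits << 1) | bit;
}

/* Same clamping as lzms_get_probability(): 0% and 100% are disallowed.  */
static uint32_t get_probability(const struct prob_entry *e)
{
        uint32_t prob = e->num_recent_zero_bits;

        if (prob == 0)
                prob++;
        if (prob == LZMS_PROBABILITY_MAX)
                prob--;
        return prob;
}

int main(void)
{
        /* LZMS_INITIAL_RECENT_BITS = 0x55555555 has 16 of its low 64 bits
         * set, so the initial cached zero count is 64 - 16 = 48, which is
         * exactly LZMS_INITIAL_PROBABILITY.  */
        struct prob_entry e = {
                .num_recent_zero_bits = LZMS_INITIAL_PROBABILITY,
                .recent_bits = 0x0000000055555555ULL,
        };

        /* After 64 consecutive zero bits the window is all zeros, but the
         * clamp keeps the returned probability at 63/64, never 100%.  */
        for (int i = 0; i < 64; i++)
                update(&e, 0);
        printf("P(bit == 0) = %u/%u\n", get_probability(&e),
               LZMS_PROBABILITY_MAX);
        return 0;
}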
diff --git a/include/wimlib/lzms_constants.h b/include/wimlib/lzms_constants.h
new file mode 100644 (file)
index 0000000..3bc5761
--- /dev/null
@@ -0,0 +1,44 @@
+/*
+ * lzms_constants.h
+ *
+ * Constants for the LZMS compression format.
+ */
+
+#ifndef _LZMS_CONSTANTS_H
+#define _LZMS_CONSTANTS_H
+
+#define LZMS_NUM_RECENT_OFFSETS                        3
+#define LZMS_MAX_INIT_RECENT_OFFSET            (LZMS_NUM_RECENT_OFFSETS + 1)
+#define LZMS_OFFSET_OFFSET                     (LZMS_NUM_RECENT_OFFSETS - 1)
+
+#define LZMS_PROBABILITY_BITS                  6
+#define LZMS_PROBABILITY_MAX                   (1U << LZMS_PROBABILITY_BITS)
+#define LZMS_INITIAL_PROBABILITY               48
+#define LZMS_INITIAL_RECENT_BITS               0x0000000055555555ULL
+
+#define LZMS_NUM_MAIN_STATES                   16
+#define LZMS_NUM_MATCH_STATES                  32
+#define LZMS_NUM_LZ_MATCH_STATES               64
+#define LZMS_NUM_LZ_REPEAT_MATCH_STATES                64
+#define LZMS_NUM_DELTA_MATCH_STATES            64
+#define LZMS_NUM_DELTA_REPEAT_MATCH_STATES     64
+#define LZMS_MAX_NUM_STATES                    64
+
+#define LZMS_NUM_LITERAL_SYMS                  256
+#define LZMS_NUM_LEN_SYMS                      54
+#define LZMS_NUM_DELTA_POWER_SYMS              8
+#define LZMS_MAX_NUM_OFFSET_SYMS               799
+#define LZMS_MAX_NUM_SYMS                      799
+
+#define LZMS_MAX_CODEWORD_LEN                  15
+
+#define LZMS_LITERAL_CODE_REBUILD_FREQ         1024
+#define LZMS_LZ_OFFSET_CODE_REBUILD_FREQ       1024
+#define LZMS_LENGTH_CODE_REBUILD_FREQ          512
+#define LZMS_DELTA_OFFSET_CODE_REBUILD_FREQ    1024
+#define LZMS_DELTA_POWER_CODE_REBUILD_FREQ     512
+
+#define LZMS_X86_MAX_GOOD_TARGET_OFFSET                65535
+#define LZMS_X86_MAX_TRANSLATION_OFFSET                1023
+
+#endif /* _LZMS_CONSTANTS_H */
diff --git a/include/wimlib/lzx.h b/include/wimlib/lzx.h
index 97d4a691c62ab46f13de4a66840af687a8b99d58..da0c55143c4bc9a569bdc5550b51c0e16c16d80b 100644 (file)
 #      define LZX_ASSERT(...)
 #endif
 
-#define USE_LZX_EXTRA_BITS_ARRAY
+extern const u32 lzx_offset_slot_base[LZX_MAX_OFFSET_SLOTS];
 
-#ifdef USE_LZX_EXTRA_BITS_ARRAY
-extern const u8 lzx_extra_bits[LZX_MAX_POSITION_SLOTS];
-#endif
-
-/* Given the number of an LZX position slot, return the number of extra bits that
- * are needed to encode the match offset. */
-static inline unsigned
-lzx_get_num_extra_bits(unsigned position_slot)
-{
-#ifdef USE_LZX_EXTRA_BITS_ARRAY
-       /* Use a table */
-       return lzx_extra_bits[position_slot];
-#else
-       /* Calculate directly using a shift and subtraction. */
-       LZX_ASSERT(position_slot >= 2 && position_slot <= 37);
-       return (position_slot >> 1) - 1;
-#endif
-}
-
-extern const u32 lzx_position_base[LZX_MAX_POSITION_SLOTS];
+extern const u8 lzx_extra_offset_bits[LZX_MAX_OFFSET_SLOTS];
 
-/* Returns the LZX position slot that corresponds to a given formatted offset.
+/* Returns the LZX offset slot that corresponds to a given adjusted offset.
  *
  * Logically, this returns the smallest i such that
- * formatted_offset >= lzx_position_base[i].
+ * adjusted_offset >= lzx_offset_slot_base[i].
  *
  * The actual implementation below takes advantage of the regularity of the
- * numbers in the lzx_position_base array to calculate the slot directly from
- * the formatted offset without actually looking at the array.
+ * numbers in the lzx_offset_slot_base array to calculate the slot directly from
+ * the adjusted offset without actually looking at the array.
  */
 static inline unsigned
-lzx_get_position_slot_raw(u32 formatted_offset)
+lzx_get_offset_slot_raw(u32 adjusted_offset)
 {
-       if (formatted_offset >= 196608) {
-               return (formatted_offset >> 17) + 34;
+       if (adjusted_offset >= 196608) {
+               return (adjusted_offset >> 17) + 34;
        } else {
-               LZX_ASSERT(2 <= formatted_offset && formatted_offset < 655360);
-               unsigned mssb_idx = bsr32(formatted_offset);
+               LZX_ASSERT(2 <= adjusted_offset && adjusted_offset < 655360);
+               unsigned mssb_idx = bsr32(adjusted_offset);
                return (mssb_idx << 1) |
-                       ((formatted_offset >> (mssb_idx - 1)) & 1);
+                       ((adjusted_offset >> (mssb_idx - 1)) & 1);
        }
 }
 
@@ -72,13 +53,7 @@ extern unsigned lzx_get_num_main_syms(unsigned window_order);
 /* Least-recently used queue for match offsets.  */
 struct lzx_lru_queue {
        u32 R[LZX_NUM_RECENT_OFFSETS];
-}
-#ifdef __x86_64__
-_aligned_attribute(8)  /* Improves performance of LZX compression by 1% - 2%;
-                         specifically, this speeds up
-                         lzx_choose_near_optimal_item().  */
-#endif
-;
+} _aligned_attribute(sizeof(unsigned long));
 
 /* Initialize the LZX least-recently-used match offset queue at the beginning of
  * a new window for either decompression or compression.  */
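
The closed-form lzx_get_offset_slot_raw() above can be sanity-checked against
the slot base table it replaces a lookup into.  A standalone sketch
(hypothetical names; __builtin_clz stands in for wimlib's bsr32()) that
regenerates the base table from the (slot >> 1) - 1 extra-bits rule quoted in
the removed code, then compares the formula with a linear table scan:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Extra offset bits per slot: (slot >> 1) - 1 for slots >= 2, else 0
 * (the rule holds for the small slots exercised by this test).  */
static unsigned extra_bits(unsigned slot)
{
        return (slot < 2) ? 0 : (slot >> 1) - 1;
}

/* Index of the most significant set bit.  */
static unsigned bsr32(uint32_t v)
{
        return 31 - __builtin_clz(v);
}

/* The closed-form computation from the patch.  */
static unsigned get_offset_slot_raw(uint32_t adjusted_offset)
{
        if (adjusted_offset >= 196608)
                return (adjusted_offset >> 17) + 34;
        unsigned mssb_idx = bsr32(adjusted_offset);
        return (mssb_idx << 1) | ((adjusted_offset >> (mssb_idx - 1)) & 1);
}

int main(void)
{
        /* Regenerate the base table: each slot spans 2^extra_bits offsets. */
        uint32_t base[52];

        base[0] = 0;
        for (unsigned slot = 0; slot < 51; slot++)
                base[slot + 1] = base[slot] + (1U << extra_bits(slot));

        /* The formula is only defined for adjusted offsets >= 2.  */
        for (uint32_t offset = 2; offset < 65536; offset++) {
                unsigned slot = 0;

                while (slot < 50 && base[slot + 1] <= offset)
                        slot++;
                assert(get_offset_slot_raw(offset) == slot);
        }
        printf("formula matches the table\n");
        return 0;
}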
diff --git a/include/wimlib/lzx_constants.h b/include/wimlib/lzx_constants.h
index 36e812d8b80090c1054fd11a7a2a1cfe5f60a2c2..49cf8fafe4b59e471e31af0ac32e2a0a75c91a9e 100644 (file)
@@ -24,9 +24,9 @@
  * + LZX_MIN_MATCH_LEN, and a length symbol follows.  */
 #define LZX_NUM_PRIMARY_LENS         7
 
-/* Maximum number of position slots.  The actual number of position slots will
+/* Maximum number of offset slots.  The actual number of offset slots will
  * depend on the window size.  */
-#define LZX_MAX_POSITION_SLOTS 51
+#define LZX_MAX_OFFSET_SLOTS   51
 
 #define LZX_MIN_WINDOW_ORDER   15
 #define LZX_MAX_WINDOW_ORDER   21
@@ -35,7 +35,7 @@
 
 /* Maximum number of symbols in the main code.  The actual number of symbols in
  * the main code will depend on the window size.  */
-#define LZX_MAINCODE_MAX_NUM_SYMBOLS   (LZX_NUM_CHARS + (LZX_MAX_POSITION_SLOTS << 3))
+#define LZX_MAINCODE_MAX_NUM_SYMBOLS   (LZX_NUM_CHARS + (LZX_MAX_OFFSET_SLOTS << 3))
 
 /* Number of symbols in the length code.  */
 #define LZX_LENCODE_NUM_SYMBOLS                249
diff --git a/src/lzms-common.c b/src/lzms-common.c
index e274c6ba6bc2bc54e1f20fceddeabdc8bbdc89de..4363623a4b1fba1e7267d6881f4e8f97501d28be 100644 (file)
@@ -6,7 +6,7 @@
  */
 
 /*
- * Copyright (C) 2013 Eric Biggers
+ * Copyright (C) 2013, 2014 Eric Biggers
  *
  * This file is part of wimlib, a library for working with WIM files.
  *
  * Constant tables initialized by lzms_compute_slots():        *
  ***************************************************************/
 
-/* Table: position slot => position slot base value  */
-u32 lzms_position_slot_base[LZMS_MAX_NUM_OFFSET_SYMS + 1];
+/* Table: offset slot => offset slot base value  */
+u32 lzms_offset_slot_base[LZMS_MAX_NUM_OFFSET_SYMS + 1];
 
-/* Table: position slot => number of extra position bits  */
-u8 lzms_extra_position_bits[LZMS_MAX_NUM_OFFSET_SYMS];
-
-/* Table: log2(position) => [lower bound, upper bound] on position slot  */
-u16 lzms_order_to_position_slot_bounds[30][2];
+/* Table: offset slot => number of extra offset bits  */
+u8 lzms_extra_offset_bits[LZMS_MAX_NUM_OFFSET_SYMS];
 
 /* Table: length slot => length slot base value  */
 u32 lzms_length_slot_base[LZMS_NUM_LEN_SYMS + 1];
@@ -53,17 +50,14 @@ u32 lzms_length_slot_base[LZMS_NUM_LEN_SYMS + 1];
 /* Table: length slot => number of extra length bits  */
 u8 lzms_extra_length_bits[LZMS_NUM_LEN_SYMS];
 
-/* Table: length (< LZMS_NUM_FAST_LENGTHS only) => length slot  */
-u8 lzms_length_slot_fast[LZMS_NUM_FAST_LENGTHS];
-
-u32
+unsigned
 lzms_get_slot(u32 value, const u32 slot_base_tab[], unsigned num_slots)
 {
-       u32 l = 0;
-       u32 r = num_slots - 1;
+       unsigned l = 0;
+       unsigned r = num_slots - 1;
        for (;;) {
                LZMS_ASSERT(r >= l);
-               u32 slot = (l + r) / 2;
+               unsigned slot = (l + r) / 2;
                if (value >= slot_base_tab[slot]) {
                        if (value < slot_base_tab[slot + 1])
                                return slot;
@@ -79,16 +73,16 @@ static void
 lzms_decode_delta_rle_slot_bases(u32 slot_bases[],
                                 u8 extra_bits[],
                                 const u8 delta_run_lens[],
-                                u32 num_run_lens,
+                                unsigned num_run_lens,
                                 u32 final,
-                                u32 expected_num_slots)
+                                unsigned expected_num_slots)
 {
-       u32 order = 0;
+       unsigned order = 0;
        u32 delta = 1;
        u32 base = 0;
-       u32 slot = 0;
-       for (u32 i = 0; i < num_run_lens; i++) {
-               u8 run_len = delta_run_lens[i];
+       unsigned slot = 0;
+       for (unsigned i = 0; i < num_run_lens; i++) {
+               unsigned run_len = delta_run_lens[i];
                while (run_len--) {
                        base += delta;
                        if (slot > 0)
@@ -105,17 +99,17 @@ lzms_decode_delta_rle_slot_bases(u32 slot_bases[],
        extra_bits[slot - 1] = bsr32(slot_bases[slot] - slot_bases[slot - 1]);
 }
 
-/* Initialize the global position and length slot tables.  */
+/* Initialize the global offset and length slot tables.  */
 static void
 lzms_compute_slots(void)
 {
-       /* If an explicit formula that maps LZMS position and length slots to
-        * slot bases exists, then it could be used here.  But until one is
-        * found, the following code fills in the slots using the observation
-        * that the increase from one slot base to the next is an increasing
-        * power of 2.  Therefore, run-length encoding of the delta of adjacent
-        * entries can be used.  */
-       static const u8 position_slot_delta_run_lens[] = {
+       /* If an explicit formula that maps LZMS offset and length slots to slot
+        * bases exists, then it could be used here.  But until one is found,
+        * the following code fills in the slots using the observation that the
+        * increase from one slot base to the next is an increasing power of 2.
+        * Therefore, run-length encoding of the delta of adjacent entries can
+        * be used.  */
+       static const u8 offset_slot_delta_run_lens[] = {
                9,   0,   9,   7,   10,  15,  15,  20,
                20,  30,  33,  40,  42,  45,  60,  73,
                80,  85,  95,  105, 6,
@@ -127,23 +121,14 @@ lzms_compute_slots(void)
                1,
        };
 
-       /* Position slots  */
-       lzms_decode_delta_rle_slot_bases(lzms_position_slot_base,
-                                        lzms_extra_position_bits,
-                                        position_slot_delta_run_lens,
-                                        ARRAY_LEN(position_slot_delta_run_lens),
+       /* Offset slots  */
+       lzms_decode_delta_rle_slot_bases(lzms_offset_slot_base,
+                                        lzms_extra_offset_bits,
+                                        offset_slot_delta_run_lens,
+                                        ARRAY_LEN(offset_slot_delta_run_lens),
                                         0x7fffffff,
                                         LZMS_MAX_NUM_OFFSET_SYMS);
 
-       for (u32 order = 0; order < 30; order++) {
-               lzms_order_to_position_slot_bounds[order][0] =
-                       lzms_get_slot(1U << order, lzms_position_slot_base,
-                                     LZMS_MAX_NUM_OFFSET_SYMS);
-               lzms_order_to_position_slot_bounds[order][1] =
-                       lzms_get_slot((1U << (order + 1)) - 1, lzms_position_slot_base,
-                                     LZMS_MAX_NUM_OFFSET_SYMS);
-       }
-
        /* Length slots  */
        lzms_decode_delta_rle_slot_bases(lzms_length_slot_base,
                                         lzms_extra_length_bits,
@@ -151,17 +136,9 @@ lzms_compute_slots(void)
                                         ARRAY_LEN(length_slot_delta_run_lens),
                                         0x400108ab,
                                         LZMS_NUM_LEN_SYMS);
-
-       /* Create table mapping short lengths to length slots.  */
-       for (u32 slot = 0, i = 0; i < LZMS_NUM_FAST_LENGTHS; i++) {
-               if (i >= lzms_length_slot_base[slot + 1])
-                       slot++;
-               lzms_length_slot_fast[i] = slot;
-       }
 }
 
-/* Initialize the global position and length slot tables if not done so already.
- * */
+/* Initialize the global offset and length slot tables if not already done.  */
 void
 lzms_init_slots(void)
 {
@@ -337,7 +314,7 @@ lzms_x86_filter(u8 data[restrict], s32 size,
        }
 }
 
-static void
+void
 lzms_init_lz_lru_queues(struct lzms_lz_lru_queues *lz)
 {
        /* Recent offsets for LZ matches  */
@@ -348,7 +325,7 @@ lzms_init_lz_lru_queues(struct lzms_lz_lru_queues *lz)
        lz->upcoming_offset = 0;
 }
 
-static void
+void
 lzms_init_delta_lru_queues(struct lzms_delta_lru_queues *delta)
 {
        /* Recent offsets and powers for LZ matches  */
@@ -371,7 +348,7 @@ lzms_init_lru_queues(struct lzms_lru_queues *lru)
 }
 
 void
-lzms_update_lz_lru_queues(struct lzms_lz_lru_queues *lz)
+lzms_update_lz_lru_queue(struct lzms_lz_lru_queues *lz)
 {
        if (lz->prev_offset != 0) {
                for (int i = LZMS_NUM_RECENT_OFFSETS - 1; i >= 0; i--)
@@ -400,6 +377,6 @@ lzms_update_delta_lru_queues(struct lzms_delta_lru_queues *delta)
 void
 lzms_update_lru_queues(struct lzms_lru_queues *lru)
 {
-       lzms_update_lz_lru_queues(&lru->lz);
+       lzms_update_lz_lru_queue(&lru->lz);
        lzms_update_delta_lru_queues(&lru->delta);
 }
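
The delta run-length scheme that lzms_compute_slots() decodes is easy to see
on a toy input: the k-th run length says how many consecutive slot-to-slot
increments equal 2^k.  An illustration with made-up run lengths (not the real
LZMS tables):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        static const uint8_t run_lens[] = { 3, 2, 2 };
        uint32_t base = 0, delta = 1;
        unsigned slot = 0;

        for (unsigned i = 0; i < sizeof(run_lens); i++) {
                for (uint8_t n = run_lens[i]; n--; slot++) {
                        printf("slot %u: base %u, %u extra bits\n",
                               slot, base, i);
                        base += delta;
                }
                delta <<= 1;    /* the next run's increments double */
        }
        /* Prints bases 0, 1, 2, 3, 5, 7, 11 for slots 0-6; a value in
         * [base, base + 2^extra_bits) is coded as the slot number plus
         * that many extra bits.  */
        return 0;
}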
diff --git a/src/lzms-compress.c b/src/lzms-compress.c
index 8ad19bc0d1cb2820524d4cde3b6f680b3bb45326..fb1f777ac6b5c3adeb69c1a8bf7508e603baf92f 100644 (file)
@@ -1,5 +1,7 @@
 /*
  * lzms-compress.c
+ *
+ * A compressor that produces output compatible with the LZMS compression format.
  */
 
 /*
  * along with wimlib; if not, see http://www.gnu.org/licenses/.
  */
 
-/* This a compressor for the LZMS compression format.  More details about this
- * format can be found in lzms-decompress.c.
- *
- * Also see lzx-compress.c for general information about match-finding and
- * match-choosing that also applies to this LZMS compressor.
- *
- * NOTE: this compressor currently does not code any delta matches.
- */
-
 #ifdef HAVE_CONFIG_H
 #  include "config.h"
 #endif
 
-#include "wimlib/assert.h"
-#include "wimlib/compiler.h"
-#include "wimlib/compressor_ops.h"
 #include "wimlib/compress_common.h"
+#include "wimlib/compressor_ops.h"
 #include "wimlib/endianness.h"
 #include "wimlib/error.h"
 #include "wimlib/lz_mf.h"
 #include <limits.h>
 #include <pthread.h>
 
-/* Stucture used for writing raw bits to the end of the LZMS-compressed data as
- * a series of 16-bit little endian coding units.  */
+/* Stucture used for writing raw bits as a series of 16-bit little endian coding
+ * units.  This starts at the *end* of the compressed data buffer and proceeds
+ * backwards.  */
 struct lzms_output_bitstream {
-       /* Buffer variable containing zero or more bits that have been logically
-        * written to the bitstream but not yet written to memory.  This must be
-        * at least as large as the coding unit size.  */
-       u16 bitbuf;
 
-       /* Number of bits in @bitbuf that are valid.  */
-       unsigned num_free_bits;
+       /* Bits that haven't yet been written to the output buffer.  */
+       u64 bitbuf;
+
+       /* Number of bits currently held in @bitbuf.  */
+       unsigned bitcount;
 
        /* Pointer to one past the next position in the compressed data buffer
         * at which to output a 16-bit coding unit.  */
-       le16 *out;
+       le16 *next;
 
-       /* Maximum number of 16-bit coding units that can still be output to
-        * the compressed data buffer.  */
-       size_t num_le16_remaining;
-
-       /* Set to %true if not all coding units could be output due to
-        * insufficient space.  */
-       bool overrun;
+       /* Pointer to the beginning of the output buffer.  (The "end" when
+        * writing backwards!)  */
+       le16 *begin;
 };
 
-/* Stucture used for range encoding (raw version).  */
+/* Stucture used for range encoding (raw version).  This starts at the
+ * *beginning* of the compressed data buffer and proceeds forward.  */
 struct lzms_range_encoder_raw {
 
        /* A 33-bit variable that holds the low boundary of the current range.
@@ -91,25 +79,21 @@ struct lzms_range_encoder_raw {
         * subsequent such coding units are 0xffff.  */
        u32 cache_size;
 
-       /* Pointer to the next position in the compressed data buffer at which
-        * to output a 16-bit coding unit.  */
-       le16 *out;
-
-       /* Maximum number of 16-bit coding units that can still be output to
-        * the compressed data buffer.  */
-       size_t num_le16_remaining;
+       /* Pointer to the beginning of the output buffer.  */
+       le16 *begin;
 
-       /* %true when the very first coding unit has not yet been output.  */
-       bool first;
+       /* Pointer to the position in the output buffer at which the next coding
+        * unit must be written.  */
+       le16 *next;
 
-       /* Set to %true if not all coding units could be output due to
-        * insufficient space.  */
-       bool overrun;
+       /* Pointer just past the end of the output buffer.  */
+       le16 *end;
 };
 
 /* Structure used for range encoding.  This wraps around `struct
  * lzms_range_encoder_raw' to use and maintain probability entries.  */
 struct lzms_range_encoder {
+
        /* Pointer to the raw range encoder, which has no persistent knowledge
         * of probabilities.  Multiple lzms_range_encoder's share the same
         * lzms_range_encoder_raw.  */
@@ -157,6 +141,7 @@ struct lzms_huffman_encoder {
        u32 codewords[LZMS_MAX_NUM_SYMS];
 };
 
+/* Internal compression parameters  */
 struct lzms_compressor_params {
        u32 min_match_length;
        u32 nice_match_length;
@@ -164,44 +149,35 @@ struct lzms_compressor_params {
        u32 optim_array_length;
 };
 
-/* State of the LZMS compressor.  */
+/* State of the LZMS compressor  */
 struct lzms_compressor {
-       /* Pointer to a buffer holding the preprocessed data to compress.  */
-       u8 *window;
 
-       /* Current position in @buffer.  */
-       u32 cur_window_pos;
+       /* Internal compression parameters  */
+       struct lzms_compressor_params params;
 
-       /* Size of the data in @buffer.  */
-       u32 window_size;
+       /* Data currently being compressed  */
+       u8 *cur_window;
+       u32 cur_window_size;
 
-       /* Lempel-Ziv match-finder.  */
+       /* Lempel-Ziv match-finder  */
        struct lz_mf *mf;
 
-       /* Temporary space to store found matches.  */
+       /* Temporary space to store found matches  */
        struct lz_match *matches;
 
-       /* Match-chooser data.  */
+       /* Per-position data for near-optimal parsing  */
        struct lzms_mc_pos_data *optimum;
-       unsigned optimum_cur_idx;
-       unsigned optimum_end_idx;
-
-       /* Maximum block size this compressor instantiation allows.  This is the
-        * allocated size of @window.  */
-       u32 max_block_size;
-
-       /* Compression parameters.  */
-       struct lzms_compressor_params params;
+       struct lzms_mc_pos_data *optimum_end;
 
        /* Raw range encoder which outputs to the beginning of the compressed
 
        /* Raw range encoder which outputs to the beginning of the compressed
-        * data buffer, proceeding forwards.  */
+        * data buffer, proceeding forwards  */
        struct lzms_range_encoder_raw rc;
 
        /* Bitstream which outputs to the end of the compressed data buffer,
        struct lzms_range_encoder_raw rc;
 
        /* Bitstream which outputs to the end of the compressed data buffer,
-        * proceeding backwards.  */
+        * proceeding backwards  */
        struct lzms_output_bitstream os;
 
-       /* Range encoders.  */
+       /* Range encoders  */
        struct lzms_range_encoder main_range_encoder;
        struct lzms_range_encoder match_range_encoder;
        struct lzms_range_encoder lz_match_range_encoder;
        struct lzms_range_encoder main_range_encoder;
        struct lzms_range_encoder match_range_encoder;
        struct lzms_range_encoder lz_match_range_encoder;
@@ -209,33 +185,79 @@ struct lzms_compressor {
        struct lzms_range_encoder delta_match_range_encoder;
        struct lzms_range_encoder delta_repeat_match_range_encoders[LZMS_NUM_RECENT_OFFSETS - 1];
 
        struct lzms_range_encoder delta_match_range_encoder;
        struct lzms_range_encoder delta_repeat_match_range_encoders[LZMS_NUM_RECENT_OFFSETS - 1];
 
-       /* Huffman encoders.  */
+       /* Huffman encoders  */
        struct lzms_huffman_encoder literal_encoder;
        struct lzms_huffman_encoder lz_offset_encoder;
        struct lzms_huffman_encoder length_encoder;
        struct lzms_huffman_encoder delta_power_encoder;
        struct lzms_huffman_encoder delta_offset_encoder;
 
        struct lzms_huffman_encoder literal_encoder;
        struct lzms_huffman_encoder lz_offset_encoder;
        struct lzms_huffman_encoder length_encoder;
        struct lzms_huffman_encoder delta_power_encoder;
        struct lzms_huffman_encoder delta_offset_encoder;
 
-       /* LRU (least-recently-used) queues for match information.  */
-       struct lzms_lru_queues lru;
-
-       /* Used for preprocessing.  */
+       /* Used for preprocessing  */
        s32 last_target_usages[65536];
        s32 last_target_usages[65536];
+
+#define LZMS_NUM_FAST_LENGTHS 256
+       /* Table: length => length slot for small lengths  */
+       u8 length_slot_fast[LZMS_NUM_FAST_LENGTHS];
+
+       /* Table: length => current cost for small match lengths  */
+       u32 length_cost_fast[LZMS_NUM_FAST_LENGTHS];
+
+#define LZMS_NUM_FAST_OFFSETS 32768
+       /* Table: offset => offset slot for small offsets  */
+       u8 offset_slot_fast[LZMS_NUM_FAST_OFFSETS];
 };
 
+/*
+ * Match chooser position data:
+ *
+ * An array of these structures is used during the near-optimal match-choosing
+ * algorithm.  They correspond to consecutive positions in the window and are
+ * used to keep track of the cost to reach each position, and the match/literal
+ * choices that need to be chosen to reach that position.
+ */
 struct lzms_mc_pos_data {
+
+       /* The cost, in bits, of the lowest-cost path that has been found to
+        * reach this position.  This can change as progressively lower cost
+        * paths are found to reach this position.  */
        u32 cost;
-#define MC_INFINITE_COST ((u32)~0UL)
-       union {
-               struct {
-                       u32 link;
-                       u32 match_offset;
-               } prev;
-               struct {
-                       u32 link;
-                       u32 match_offset;
-               } next;
-       };
+#define MC_INFINITE_COST UINT32_MAX
+
+       /* The match or literal that was taken to reach this position.  This can
+        * change as progressively lower cost paths are found to reach this
+        * position.
+        *
+        * This variable is divided into two bitfields.
+        *
+        * Literals:
+        *      Low bits are 1, high bits are the literal.
+        *
+        * Explicit offset matches:
+        *      Low bits are the match length, high bits are the offset plus 2.
+        *
+        * Repeat offset matches:
+        *      Low bits are the match length, high bits are the queue index.
+        */
+       u64 mc_item_data;
+#define MC_OFFSET_SHIFT 32
+#define MC_LEN_MASK (((u64)1 << MC_OFFSET_SHIFT) - 1)
+
+       /* The LZMS adaptive state that exists at this position.  This is filled
+        * in lazily, only after the minimum-cost path to this position is
+        * found.
+        *
+        * Note: the way we handle this adaptive state in the "minimum-cost"
+        * parse is actually only an approximation.  It's possible for the
+        * globally optimal, minimum cost path to contain a prefix, ending at a
+        * position, where that path prefix is *not* the minimum cost path to
+        * that position.  This can happen if such a path prefix results in a
+        * different adaptive state which results in lower costs later.  We do
+        * not solve this problem; we only consider the lowest cost to reach
+        * each position, which seems to be an acceptable approximation.
+        *
+        * Note: this adaptive state also does not include the probability
+        * entries or current Huffman codewords.  Those aren't maintained
+        * per-position and are only updated occassionally.  */
        struct lzms_adaptive_state {
                struct lzms_lz_lru_queues lru;
                u8 main_state;
@@ -245,60 +267,110 @@ struct lzms_mc_pos_data {
        } state;
 };
 
-/* Initialize the output bitstream @os to write forwards to the specified
+static void
+lzms_init_fast_slots(struct lzms_compressor *c)
+{
+       /* Create table mapping small lengths to length slots.  */
+       for (unsigned slot = 0, i = 0; i < LZMS_NUM_FAST_LENGTHS; i++) {
+               while (i >= lzms_length_slot_base[slot + 1])
+                       slot++;
+               c->length_slot_fast[i] = slot;
+       }
+
+       /* Create table mapping small offsets to offset slots.  */
+       for (unsigned slot = 0, i = 0; i < LZMS_NUM_FAST_OFFSETS; i++) {
+               while (i >= lzms_offset_slot_base[slot + 1])
+                       slot++;
+               c->offset_slot_fast[i] = slot;
+       }
+}
+
+static inline unsigned
+lzms_get_length_slot_fast(const struct lzms_compressor *c, u32 length)
+{
+       if (likely(length < LZMS_NUM_FAST_LENGTHS))
+               return c->length_slot_fast[length];
+       else
+               return lzms_get_length_slot(length);
+}
+
+static inline unsigned
+lzms_get_offset_slot_fast(const struct lzms_compressor *c, u32 offset)
+{
+       if (offset < LZMS_NUM_FAST_OFFSETS)
+               return c->offset_slot_fast[offset];
+       else
+               return lzms_get_offset_slot(offset);
+}
+
+/* Initialize the output bitstream @os to write backwards to the specified
  * compressed data buffer @out that is @out_limit 16-bit integers long.  */
 static void
 lzms_output_bitstream_init(struct lzms_output_bitstream *os,
                           le16 *out, size_t out_limit)
 {
        os->bitbuf = 0;
-       os->num_free_bits = 16;
-       os->out = out + out_limit;
-       os->num_le16_remaining = out_limit;
-       os->overrun = false;
+       os->bitcount = 0;
+       os->next = out + out_limit;
+       os->begin = out;
 }
 
-/* Write @num_bits bits, contained in the low @num_bits bits of @bits (ordered
- * from high-order to low-order), to the output bitstream @os.  */
-static void
-lzms_output_bitstream_put_bits(struct lzms_output_bitstream *os,
-                              u32 bits, unsigned num_bits)
+/*
+ * Write some bits, contained in the low @num_bits bits of @bits (ordered from
+ * high-order to low-order), to the output bitstream @os.
+ *
+ * @max_num_bits is a compile-time constant that specifies the maximum number of
+ * bits that can ever be written at this call site.
+ */
+static inline void
+lzms_output_bitstream_put_varbits(struct lzms_output_bitstream *os,
+                                 u32 bits, unsigned num_bits,
+                                 unsigned max_num_bits)
 {
-       bits &= (1U << num_bits) - 1;
+       LZMS_ASSERT(num_bits <= 48);
 
-       while (num_bits > os->num_free_bits) {
+       /* Add the bits to the bit buffer variable.  */
+       os->bitcount += num_bits;
+       os->bitbuf = (os->bitbuf << num_bits) | bits;
 
-               if (unlikely(os->num_le16_remaining == 0)) {
-                       os->overrun = true;
-                       return;
-               }
+       /* Check whether any coding units need to be written.  */
+       while (os->bitcount >= 16) {
 
-               unsigned num_fill_bits = os->num_free_bits;
+               os->bitcount -= 16;
 
-               os->bitbuf <<= num_fill_bits;
-               os->bitbuf |= bits >> (num_bits - num_fill_bits);
+               /* Write a coding unit, unless it would underflow the buffer. */
+               if (os->next != os->begin)
+                       *--os->next = cpu_to_le16(os->bitbuf >> os->bitcount);
 
-               *--os->out = cpu_to_le16(os->bitbuf);
-               --os->num_le16_remaining;
-
-               os->num_free_bits = 16;
-               num_bits -= num_fill_bits;
-               bits &= (1U << num_bits) - 1;
+               /* Optimization for call sites that never write more than 16
+                * bits at once.  */
+               if (max_num_bits <= 16)
+                       break;
        }
-       os->bitbuf <<= num_bits;
-       os->bitbuf |= bits;
-       os->num_free_bits -= num_bits;
+}
+
+/* Use when @num_bits is a compile-time constant.  Otherwise use
+ * lzms_output_bitstream_put_bits().  */
+static inline void
+lzms_output_bitstream_put_bits(struct lzms_output_bitstream *os,
+                              u32 bits, unsigned num_bits)
+{
+       lzms_output_bitstream_put_varbits(os, bits, num_bits, num_bits);
 }
 
 /* Flush the output bitstream, ensuring that all bits written to it have been
- * written to memory.  Returns %true if all bits were output successfully, or
- * %false if an overrun occurred.  */
+ * written to memory.  Returns %true if all bits have been output successfully,
+ * or %false if an overrun occurred.  */
 static bool
 lzms_output_bitstream_flush(struct lzms_output_bitstream *os)
 {
-       if (os->num_free_bits != 16)
-               lzms_output_bitstream_put_bits(os, 0, os->num_free_bits + 1);
-       return !os->overrun;
+       if (os->next == os->begin)
+               return false;
+
+       if (os->bitcount != 0)
+               *--os->next = cpu_to_le16(os->bitbuf << (16 - os->bitcount));
+
+       return true;
 }
 
 /* Initialize the range encoder @rc to write forwards to the specified
@@ -311,10 +383,9 @@ lzms_range_encoder_raw_init(struct lzms_range_encoder_raw *rc,
        rc->range = 0xffffffff;
        rc->cache = 0;
        rc->cache_size = 1;
-       rc->out = out;
-       rc->num_le16_remaining = out_limit;
-       rc->first = true;
-       rc->overrun = false;
+       rc->begin = out;
+       rc->next = out - 1;
+       rc->end = out + out_limit;
 }
 
 /*
@@ -334,26 +405,19 @@ lzms_range_encoder_raw_init(struct lzms_range_encoder_raw *rc,
 static void
 lzms_range_encoder_raw_shift_low(struct lzms_range_encoder_raw *rc)
 {
-       LZMS_DEBUG("low=%"PRIx64", cache=%"PRIx64", cache_size=%u",
-                  rc->low, rc->cache, rc->cache_size);
        if ((u32)(rc->low) < 0xffff0000 ||
            (u32)(rc->low >> 32) != 0)
        {
                /* Carry not needed (rc->low < 0xffff0000), or carry occurred
                 * ((rc->low >> 32) != 0, a.k.a. the carry bit is 1).  */
                do {
-                       if (!rc->first) {
-                               if (rc->num_le16_remaining == 0) {
-                                       rc->overrun = true;
-                                       return;
-                               }
-                               *rc->out++ = cpu_to_le16(rc->cache +
-                                                        (u16)(rc->low >> 32));
-                               --rc->num_le16_remaining;
+                       if (likely(rc->next >= rc->begin)) {
+                               if (rc->next != rc->end)
+                                       *rc->next++ = cpu_to_le16(rc->cache +
+                                                                 (u16)(rc->low >> 32));
                        } else {
-                               rc->first = false;
+                               rc->next++;
                        }
-
                        rc->cache = 0xffff;
                } while (--rc->cache_size != 0);
 
@@ -377,15 +441,15 @@ lzms_range_encoder_raw_flush(struct lzms_range_encoder_raw *rc)
 {
        for (unsigned i = 0; i < 4; i++)
                lzms_range_encoder_raw_shift_low(rc);
-       return !rc->overrun;
+       return rc->next != rc->end;
 }
 
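The hunk below keeps only the head of lzms_range_encoder_raw_encode_bit();
the arithmetic after the normalization step falls outside the diff context.
As a sketch, the standard binary range-coding step consistent with the @prob
convention documented below (a probability out of LZMS_PROBABILITY_MAX, with
LZMS_PROBABILITY_MAX == 1 << LZMS_PROBABILITY_BITS) looks like this; it is an
illustration, not necessarily wimlib's exact lines:

    u32 bound = (rc->range >> LZMS_PROBABILITY_BITS) * prob;

    if (bit == 0) {
            rc->range = bound;      /* keep the low part of the interval */
    } else {
            rc->low += bound;       /* keep the high part */
            rc->range -= bound;
    }
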
 /* Encode the next bit using the range encoder (raw version).
  *
  * @prob is the chance out of LZMS_PROBABILITY_MAX that the next bit is 0.  */
-static void
-lzms_range_encoder_raw_encode_bit(struct lzms_range_encoder_raw *rc, int bit,
-                                 u32 prob)
+static inline void
+lzms_range_encoder_raw_encode_bit(struct lzms_range_encoder_raw *rc,
+                                 int bit, u32 prob)
 {
        lzms_range_encoder_raw_normalize(rc);
 
@@ -400,7 +464,7 @@ lzms_range_encoder_raw_encode_bit(struct lzms_range_encoder_raw *rc, int bit,
 
 /* Encode a bit using the specified range encoder. This wraps around
  * lzms_range_encoder_raw_encode_bit() to handle using and updating the
- * appropriate probability table.  */
+ * appropriate state and probability entry.  */
 static void
 lzms_range_encode_bit(struct lzms_range_encoder *enc, int bit)
 {
@@ -410,207 +474,197 @@ lzms_range_encode_bit(struct lzms_range_encoder *enc, int bit)
        /* Load the probability entry corresponding to the current state.  */
        prob_entry = &enc->prob_entries[enc->state];
 
-       /* Treat the number of zero bits in the most recently encoded
-        * LZMS_PROBABILITY_MAX bits with this probability entry as the chance,
-        * out of LZMS_PROBABILITY_MAX, that the next bit will be a 0.  However,
-        * don't allow 0% or 100% probabilities.  */
-       prob = prob_entry->num_recent_zero_bits;
-       if (prob == 0)
-               prob = 1;
-       else if (prob == LZMS_PROBABILITY_MAX)
-               prob = LZMS_PROBABILITY_MAX - 1;
-
-       /* Encode the next bit.  */
+       /* Update the state based on the next bit.  */
+       enc->state = ((enc->state << 1) | bit) & enc->mask;
+
+       /* Get the probability that the bit is 0.  */
+       prob = lzms_get_probability(prob_entry);
+
+       /* Update the probability entry.  */
+       lzms_update_probability_entry(prob_entry, bit);
+
+       /* Encode the bit.  */
        lzms_range_encoder_raw_encode_bit(enc->rc, bit, prob);
+}
 
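The probability bookkeeping that the removed lines below performed inline now
lives behind lzms_get_probability() and lzms_update_probability_entry().
Reconstructed from that removed code (the real definitions are in the
headers), the two helpers amount to:

    static inline u32
    lzms_get_probability(const struct lzms_probability_entry *e)
    {
            /* The count of 0 bits among the last LZMS_PROBABILITY_MAX coded
             * bits is used as the probability that the next bit is 0, but
             * 0% and 100% probabilities are not allowed.  */
            u32 prob = e->num_recent_zero_bits;

            if (prob == 0)
                    prob = 1;
            else if (prob == LZMS_PROBABILITY_MAX)
                    prob = LZMS_PROBABILITY_MAX - 1;
            return prob;
    }

    static inline void
    lzms_update_probability_entry(struct lzms_probability_entry *e, int bit)
    {
            /* Slide the window of recent bits, keeping the count of 0 bits
             * cached.  The bit falling out of the window is the most
             * significant bit of recent_bits.  */
            if (e->recent_bits & (1ULL << (LZMS_PROBABILITY_MAX - 1)))
                    e->num_recent_zero_bits += (bit == 0);
            else
                    e->num_recent_zero_bits -= (bit != 0);
            e->recent_bits = (e->recent_bits << 1) | bit;
    }
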
-       /* Update the state based on the newly encoded bit.  */
-       enc->state = ((enc->state << 1) | bit) & enc->mask;
+/* Called when an adaptive Huffman code needs to be rebuilt.  */
+static void
+lzms_rebuild_huffman_code(struct lzms_huffman_encoder *enc)
+{
+       make_canonical_huffman_code(enc->num_syms,
+                                   LZMS_MAX_CODEWORD_LEN,
+                                   enc->sym_freqs,
+                                   enc->lens,
+                                   enc->codewords);
 
-       /* Update the recent bits, including the cached count of 0's.  */
-       BUILD_BUG_ON(LZMS_PROBABILITY_MAX > sizeof(prob_entry->recent_bits) * 8);
-       if (bit == 0) {
-               if (prob_entry->recent_bits & (1ULL << (LZMS_PROBABILITY_MAX - 1))) {
-                       /* Replacing 1 bit with 0 bit; increment the zero count.
-                        */
-                       prob_entry->num_recent_zero_bits++;
-               }
-       } else {
-               if (!(prob_entry->recent_bits & (1ULL << (LZMS_PROBABILITY_MAX - 1)))) {
-                       /* Replacing 0 bit with 1 bit; decrement the zero count.
-                        */
-                       prob_entry->num_recent_zero_bits--;
-               }
+       /* Dilute the frequencies.  */
+       for (unsigned i = 0; i < enc->num_syms; i++) {
+               enc->sym_freqs[i] >>= 1;
+               enc->sym_freqs[i] += 1;
        }
-       prob_entry->recent_bits = (prob_entry->recent_bits << 1) | bit;
+       enc->num_syms_written = 0;
 }
 
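The halve-and-increment dilution above is what makes the Huffman codes track
recent data: each rebuild decays old counts geometrically, while the +1
guarantees every symbol keeps a nonzero frequency and therefore a codeword.
For example, a count that receives no new occurrences decays across rebuilds
as

    40 -> 21 -> 11 -> 6 -> 4 -> 3 -> 2 -> 2 -> ...   /* freq = (freq >> 1) + 1 */

so after a handful of rebuild periods the statistics are dominated by the
most recently seen data.
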
 /* Encode a symbol using the specified Huffman encoder.  */
-static void
-lzms_huffman_encode_symbol(struct lzms_huffman_encoder *enc, u32 sym)
+static inline void
+lzms_huffman_encode_symbol(struct lzms_huffman_encoder *enc, unsigned sym)
 {
 {
-       LZMS_ASSERT(sym < enc->num_syms);
-       lzms_output_bitstream_put_bits(enc->os,
-                                      enc->codewords[sym],
-                                      enc->lens[sym]);
+       lzms_output_bitstream_put_varbits(enc->os,
+                                         enc->codewords[sym],
+                                         enc->lens[sym],
+                                         LZMS_MAX_CODEWORD_LEN);
        ++enc->sym_freqs[sym];
-       if (++enc->num_syms_written == enc->rebuild_freq) {
-               /* Adaptive code needs to be rebuilt.  */
-               LZMS_DEBUG("Rebuilding code (num_syms=%u)", enc->num_syms);
-               make_canonical_huffman_code(enc->num_syms,
-                                           LZMS_MAX_CODEWORD_LEN,
-                                           enc->sym_freqs,
-                                           enc->lens,
-                                           enc->codewords);
-
-               /* Dilute the frequencies.  */
-               for (unsigned i = 0; i < enc->num_syms; i++) {
-                       enc->sym_freqs[i] >>= 1;
-                       enc->sym_freqs[i] += 1;
-               }
-               enc->num_syms_written = 0;
-       }
+       if (++enc->num_syms_written == enc->rebuild_freq)
+               lzms_rebuild_huffman_code(enc);
 }
 
 static void
-lzms_encode_length(struct lzms_huffman_encoder *enc, u32 length)
+lzms_update_fast_length_costs(struct lzms_compressor *c);
+
+/* Encode a match length.  */
+static void
+lzms_encode_length(struct lzms_compressor *c, u32 length)
 {
        unsigned slot;
        unsigned num_extra_bits;
        u32 extra_bits;
 
-       slot = lzms_get_length_slot(length);
+       slot = lzms_get_length_slot_fast(c, length);
 
+       extra_bits = length - lzms_length_slot_base[slot];
        num_extra_bits = lzms_extra_length_bits[slot];
 
-       extra_bits = length - lzms_length_slot_base[slot];
+       lzms_huffman_encode_symbol(&c->length_encoder, slot);
+       if (c->length_encoder.num_syms_written == 0)
+               lzms_update_fast_length_costs(c);
 
-       lzms_huffman_encode_symbol(enc, slot);
-       lzms_output_bitstream_put_bits(enc->os, extra_bits, num_extra_bits);
+       lzms_output_bitstream_put_varbits(c->length_encoder.os,
+                                         extra_bits, num_extra_bits, 30);
 }
 
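Lengths and offsets share the same slot scheme: a value is coded as the
Huffman symbol for its slot, followed by (value - slot_base[slot]) sent as
extra-bits through the output bitstream.  Below is a minimal sketch of the
slot lookup; the _fast variants used above presumably consult a precomputed
per-compressor table, and this linear scan is only for illustration:

    static unsigned
    slot_for_value(u32 value, const u32 slot_base[], unsigned num_slots)
    {
            unsigned slot = 0;

            while (slot + 1 < num_slots && slot_base[slot + 1] <= value)
                    slot++;
            return slot;
    }
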
+/* Encode an LZ match offset.  */
 static void
-lzms_encode_offset(struct lzms_huffman_encoder *enc, u32 offset)
+lzms_encode_lz_offset(struct lzms_compressor *c, u32 offset)
 {
        unsigned slot;
        unsigned num_extra_bits;
        u32 extra_bits;
 
-       slot = lzms_get_position_slot(offset);
-
-       num_extra_bits = lzms_extra_position_bits[slot];
+       slot = lzms_get_offset_slot_fast(c, offset);
 
-       extra_bits = offset - lzms_position_slot_base[slot];
+       extra_bits = offset - lzms_offset_slot_base[slot];
+       num_extra_bits = lzms_extra_offset_bits[slot];
 
-       lzms_huffman_encode_symbol(enc, slot);
-       lzms_output_bitstream_put_bits(enc->os, extra_bits, num_extra_bits);
-}
-
-static void
-lzms_begin_encode_item(struct lzms_compressor *ctx)
-{
-       ctx->lru.lz.upcoming_offset = 0;
-       ctx->lru.delta.upcoming_offset = 0;
-       ctx->lru.delta.upcoming_power = 0;
-}
-
-static void
-lzms_end_encode_item(struct lzms_compressor *ctx, u32 length)
-{
-       LZMS_ASSERT(ctx->window_size - ctx->cur_window_pos >= length);
-       ctx->cur_window_pos += length;
-       lzms_update_lru_queues(&ctx->lru);
+       lzms_huffman_encode_symbol(&c->lz_offset_encoder, slot);
+       lzms_output_bitstream_put_varbits(c->lz_offset_encoder.os,
+                                         extra_bits, num_extra_bits, 30);
 }
 
 /* Encode a literal byte.  */
 static void
-lzms_encode_literal(struct lzms_compressor *ctx, u8 literal)
+lzms_encode_literal(struct lzms_compressor *c, unsigned literal)
 {
-       LZMS_DEBUG("Position %u: Encoding literal 0x%02x ('%c')",
-                  ctx->cur_window_pos, literal, literal);
-
-       lzms_begin_encode_item(ctx);
-
        /* Main bit: 0 = a literal, not a match.  */
-       lzms_range_encode_bit(&ctx->main_range_encoder, 0);
+       lzms_range_encode_bit(&c->main_range_encoder, 0);
 
        /* Encode the literal using the current literal Huffman code.  */
-       lzms_huffman_encode_symbol(&ctx->literal_encoder, literal);
-
-       lzms_end_encode_item(ctx, 1);
+       lzms_huffman_encode_symbol(&c->literal_encoder, literal);
 }
 
-/* Encode a (length, offset) pair (LZ match).  */
+/* Encode an LZ repeat offset match.  */
 static void
-lzms_encode_lz_match(struct lzms_compressor *ctx, u32 length, u32 offset)
+lzms_encode_lz_repeat_offset_match(struct lzms_compressor *c,
+                                  u32 length, unsigned rep_index)
 {
-       int recent_offset_idx;
-
-       LZMS_DEBUG("Position %u: Encoding LZ match {length=%u, offset=%u}",
-                  ctx->cur_window_pos, length, offset);
-
-       LZMS_ASSERT(length <= ctx->window_size - ctx->cur_window_pos);
-       LZMS_ASSERT(offset <= ctx->cur_window_pos);
-       LZMS_ASSERT(!memcmp(&ctx->window[ctx->cur_window_pos],
-                           &ctx->window[ctx->cur_window_pos - offset],
-                           length));
-
-       lzms_begin_encode_item(ctx);
+       unsigned i;
 
        /* Main bit: 1 = a match, not a literal.  */
-       lzms_range_encode_bit(&ctx->main_range_encoder, 1);
+       lzms_range_encode_bit(&c->main_range_encoder, 1);
 
        /* Match bit: 0 = an LZ match, not a delta match.  */
-       lzms_range_encode_bit(&ctx->match_range_encoder, 0);
+       lzms_range_encode_bit(&c->match_range_encoder, 0);
 
-       /* Determine if the offset can be represented as a recent offset.  */
-       for (recent_offset_idx = 0;
-            recent_offset_idx < LZMS_NUM_RECENT_OFFSETS;
-            recent_offset_idx++)
-               if (offset == ctx->lru.lz.recent_offsets[recent_offset_idx])
-                       break;
+       /* LZ match bit: 1 = repeat offset, not an explicit offset.  */
+       lzms_range_encode_bit(&c->lz_match_range_encoder, 1);
 
-       if (recent_offset_idx == LZMS_NUM_RECENT_OFFSETS) {
-               /* Explicit offset.  */
+       /* Encode the repeat offset index.  A 1 bit is encoded for each index
+        * passed up.  This sequence of 1 bits is terminated by a 0 bit, or
+        * automatically when (LZMS_NUM_RECENT_OFFSETS - 1) 1 bits have been
+        * encoded.  */
+       for (i = 0; i < rep_index; i++)
+               lzms_range_encode_bit(&c->lz_repeat_match_range_encoders[i], 1);
 
-               /* LZ match bit: 0 = explicit offset, not a recent offset.  */
-               lzms_range_encode_bit(&ctx->lz_match_range_encoder, 0);
+       if (i < LZMS_NUM_RECENT_OFFSETS - 1)
+               lzms_range_encode_bit(&c->lz_repeat_match_range_encoders[i], 0);
 
-               /* Encode the match offset.  */
-               lzms_encode_offset(&ctx->lz_offset_encoder, offset);
-       } else {
-               int i;
-
-               /* Recent offset.  */
+       /* Encode the match length.  */
+       lzms_encode_length(c, length);
+}
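
Since LZMS_NUM_RECENT_OFFSETS is 3 (see the BUILD_BUG_ON later in this file),
the repeat offset index costs at most two range-coded bits: index 0 is coded
as a single 0 bit, index 1 as the bits 1,0, and index 2 as the bits 1,1 with
the terminating 0 omitted.  A hypothetical decoder-side mirror of the loop
above, where decode_bit() is a placeholder rather than a wimlib function:

    unsigned rep_index = 0;

    while (rep_index < LZMS_NUM_RECENT_OFFSETS - 1 &&
           decode_bit(&lz_repeat_match_decoders[rep_index]))
            rep_index++;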
 
-               /* LZ match bit: 1 = recent offset, not an explicit offset.  */
-               lzms_range_encode_bit(&ctx->lz_match_range_encoder, 1);
+/* Encode an LZ explicit offset match.  */
+static void
+lzms_encode_lz_explicit_offset_match(struct lzms_compressor *c,
+                                    u32 length, u32 offset)
+{
+       /* Main bit: 1 = a match, not a literal.  */
+       lzms_range_encode_bit(&c->main_range_encoder, 1);
 
-               /* Encode the recent offset index.  A 1 bit is encoded for each
-                * index passed up.  This sequence of 1 bits is terminated by a
-                * 0 bit, or automatically when (LZMS_NUM_RECENT_OFFSETS - 1) 1
-                * bits have been encoded.  */
-               for (i = 0; i < recent_offset_idx; i++)
-                       lzms_range_encode_bit(&ctx->lz_repeat_match_range_encoders[i], 1);
+       /* Match bit: 0 = an LZ match, not a delta match.  */
+       lzms_range_encode_bit(&c->match_range_encoder, 0);
 
-               if (i < LZMS_NUM_RECENT_OFFSETS - 1)
-                       lzms_range_encode_bit(&ctx->lz_repeat_match_range_encoders[i], 0);
+       /* LZ match bit: 0 = explicit offset, not a repeat offset.  */
+       lzms_range_encode_bit(&c->lz_match_range_encoder, 0);
 
-               /* Initial update of the LZ match offset LRU queue.  */
-               for (; i < LZMS_NUM_RECENT_OFFSETS; i++)
-                       ctx->lru.lz.recent_offsets[i] = ctx->lru.lz.recent_offsets[i + 1];
-       }
+       /* Encode the match offset.  */
+       lzms_encode_lz_offset(c, offset);
 
        /* Encode the match length.  */
-       lzms_encode_length(&ctx->length_encoder, length);
+       lzms_encode_length(c, length);
+}
 
-       /* Save the match offset for later insertion at the front of the LZ
-        * match offset LRU queue.  */
-       ctx->lru.lz.upcoming_offset = offset;
+static void
+lzms_encode_item(struct lzms_compressor *c, u64 mc_item_data)
+{
+       u32 len = mc_item_data & MC_LEN_MASK;
+       u32 offset_data = mc_item_data >> MC_OFFSET_SHIFT;
 
-       lzms_end_encode_item(ctx, length);
+       if (len == 1)
+               lzms_encode_literal(c, offset_data);
+       else if (offset_data < LZMS_NUM_RECENT_OFFSETS)
+               lzms_encode_lz_repeat_offset_match(c, len, offset_data);
+       else
+               lzms_encode_lz_explicit_offset_match(c, len, offset_data - LZMS_OFFSET_OFFSET);
 }
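
Each match-chooser item packs the length into the low bits of a u64 and the
"offset data" into the high bits, so a single integer distinguishes the
three cases decoded above.  A sketch of the packing, assuming MC_LEN_MASK
and MC_OFFSET_SHIFT split the u64 as their names suggest:

    static u64
    make_item(u32 len, u32 offset_data)
    {
            return ((u64)offset_data << MC_OFFSET_SHIFT) | len;
    }

    /* literal byte b:        make_item(1, b)
     * repeat offset index i: make_item(len, i)
     * explicit offset off:   make_item(len, off + LZMS_OFFSET_OFFSET)
     *
     * LZMS_OFFSET_OFFSET is presumably LZMS_NUM_RECENT_OFFSETS - 1, so
     * that the smallest real offset, 1, maps just past the last repeat
     * offset index.  */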
 
-#define LZMS_COST_SHIFT 5
+/* Encode a list of matches and literals chosen by the parsing algorithm.  */
+static void
+lzms_encode_item_list(struct lzms_compressor *c,
+                     struct lzms_mc_pos_data *cur_optimum_ptr)
+{
+       struct lzms_mc_pos_data *end_optimum_ptr;
+       u64 saved_item;
+       u64 item;
+
+       /* The list is currently in reverse order (last item to first item).
+        * Reverse it.  */
+       end_optimum_ptr = cur_optimum_ptr;
+       saved_item = cur_optimum_ptr->mc_item_data;
+       do {
+               item = saved_item;
+               cur_optimum_ptr -= item & MC_LEN_MASK;
+               saved_item = cur_optimum_ptr->mc_item_data;
+               cur_optimum_ptr->mc_item_data = item;
+       } while (cur_optimum_ptr != c->optimum);
+
+       /* Walk the list of items from beginning to end, encoding each item.  */
+       do {
+               lzms_encode_item(c, cur_optimum_ptr->mc_item_data);
+               cur_optimum_ptr += (cur_optimum_ptr->mc_item_data) & MC_LEN_MASK;
+       } while (cur_optimum_ptr != end_optimum_ptr);
+}
+
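To see the reversal at work: if the chosen path covers positions 0..5 with a
3-byte match followed by a 2-byte match, the parser left the first item in
optimum[3] and the second in optimum[5], since each item is stored at the
position it ends on.  The first loop walks backwards by each item's length
and re-stores the items at optimum[0] and optimum[3], the positions they
begin on, after which the second loop can walk forward and encode them in
order.
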
+/* Each bit costs 1 << LZMS_COST_SHIFT units.  */
+#define LZMS_COST_SHIFT 6
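
In these units, a Huffman symbol with an 8-bit codeword costs 8 << 6 = 512,
and a range-coded bit whose predicted value has probability 3/4 costs about
-log2(0.75) * 64, roughly 27 units.  The lzms_rc_costs[] table initialized
below precomputes this value for every possible probability.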
 
 /*#define LZMS_RC_COSTS_USE_FLOATING_POINT*/
 
@@ -681,24 +735,14 @@ lzms_init_rc_costs(void)
        pthread_once(&once, lzms_do_init_rc_costs);
 }
 
-/*
- * Return the cost to range-encode the specified bit when in the specified
- * state.
- *
- * @enc                The range encoder to use.
- * @cur_state  Current state, which indicates the probability entry to choose.
- *             Updated by this function.
- * @bit                The bit to encode (0 or 1).
- */
-static u32
-lzms_rc_bit_cost(const struct lzms_range_encoder *enc, u8 *cur_state, int bit)
+/* Return the cost to range-encode the specified bit from the specified
+ * state.  */
+static inline u32
+lzms_rc_bit_cost(const struct lzms_range_encoder *enc, u8 cur_state, int bit)
 {
        u32 prob_zero;
        u32 prob_correct;
 
-       prob_zero = enc->prob_entries[*cur_state & enc->mask].num_recent_zero_bits;
-
-       *cur_state = (*cur_state << 1) | bit;
+       prob_zero = enc->prob_entries[cur_state].num_recent_zero_bits;
 
        if (bit == 0)
                prob_correct = prob_zero;
@@ -708,444 +752,487 @@ lzms_rc_bit_cost(const struct lzms_range_encoder *enc, u8 *cur_state, int bit)
        return lzms_rc_costs[prob_correct];
 }
 
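The table lookup above replaces per-bit logarithm math: lzms_rc_costs[p]
holds -log2(p / LZMS_PROBABILITY_MAX) scaled by 1 << LZMS_COST_SHIFT.  A
sketch of how such a table can be filled, corresponding to the
LZMS_RC_COSTS_USE_FLOATING_POINT variant (wimlib also has an integer-only
path):

    #include <math.h>

    static u32 lzms_rc_costs[LZMS_PROBABILITY_MAX + 1];

    static void
    init_rc_costs_sketch(void)
    {
            for (u32 p = 1; p <= LZMS_PROBABILITY_MAX; p++)
                    lzms_rc_costs[p] = (u32)(-log2((double)p / LZMS_PROBABILITY_MAX) *
                                             (1 << LZMS_COST_SHIFT));
    }
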
-static u32
-lzms_huffman_symbol_cost(const struct lzms_huffman_encoder *enc, u32 sym)
+/* Return the cost to Huffman-encode the specified symbol.  */
+static inline u32
+lzms_huffman_symbol_cost(const struct lzms_huffman_encoder *enc, unsigned sym)
 {
-       return enc->lens[sym] << LZMS_COST_SHIFT;
+       return (u32)enc->lens[sym] << LZMS_COST_SHIFT;
 }
 
-static u32
-lzms_offset_cost(const struct lzms_huffman_encoder *enc, u32 offset)
+/* Return the cost to encode the specified literal byte.  */
+static inline u32
+lzms_literal_cost(const struct lzms_compressor *c, unsigned literal,
+                 const struct lzms_adaptive_state *state)
 {
-       u32 slot;
-       u32 num_extra_bits;
-       u32 cost = 0;
-
-       slot = lzms_get_position_slot(offset);
-
-       cost += lzms_huffman_symbol_cost(enc, slot);
-
-       num_extra_bits = lzms_extra_position_bits[slot];
-
-       cost += num_extra_bits << LZMS_COST_SHIFT;
-
-       return cost;
+       return lzms_rc_bit_cost(&c->main_range_encoder, state->main_state, 0) +
+              lzms_huffman_symbol_cost(&c->literal_encoder, literal);
 }
 
-static u32
-lzms_get_length_cost(const struct lzms_huffman_encoder *enc, u32 length)
+/* Update the table that directly provides the costs for small lengths.  */
+static void
+lzms_update_fast_length_costs(struct lzms_compressor *c)
 {
-       u32 slot;
-       u32 num_extra_bits;
+       u32 len;
+       int slot = -1;
        u32 cost = 0;
 
-       slot = lzms_get_length_slot(length);
+       for (len = 1; len < LZMS_NUM_FAST_LENGTHS; len++) {
 
-       cost += lzms_huffman_symbol_cost(enc, slot);
-
-       num_extra_bits = lzms_extra_length_bits[slot];
-
-       cost += num_extra_bits << LZMS_COST_SHIFT;
+               while (len >= lzms_length_slot_base[slot + 1]) {
+                       slot++;
+                       cost = (u32)(c->length_encoder.lens[slot] +
+                                    lzms_extra_length_bits[slot]) << LZMS_COST_SHIFT;
+               }
 
-       return cost;
+               c->length_cost_fast[len] = cost;
+       }
 }
 
-static u32
-lzms_get_matches(struct lzms_compressor *ctx, struct lz_match **matches_ret)
+/* Return the cost to encode the specified match length, which must be less than
+ * LZMS_NUM_FAST_LENGTHS.  */
+static inline u32
+lzms_fast_length_cost(const struct lzms_compressor *c, u32 length)
 {
-       *matches_ret = ctx->matches;
-       return lz_mf_get_matches(ctx->mf, ctx->matches);
+       LZMS_ASSERT(length < LZMS_NUM_FAST_LENGTHS);
+       return c->length_cost_fast[length];
 }
 
-static void
-lzms_skip_bytes(struct lzms_compressor *ctx, u32 n)
+/* Return the cost to encode the specified LZ match offset.  */
+static inline u32
+lzms_lz_offset_cost(const struct lzms_compressor *c, u32 offset)
 {
-       lz_mf_skip_positions(ctx->mf, n);
+       unsigned slot = lzms_get_offset_slot_fast(c, offset);
+
+       return (u32)(c->lz_offset_encoder.lens[slot] +
+                    lzms_extra_offset_bits[slot]) << LZMS_COST_SHIFT;
 }
 
-static u32
-lzms_get_literal_cost(struct lzms_compressor *ctx,
-                     struct lzms_adaptive_state *state, u8 literal)
+/*
+ * Consider coding the match at repeat offset index @rep_idx.  Consider each
+ * length from the minimum (2) to the full match length (@rep_len).
+ */
+static inline void
+lzms_consider_lz_repeat_offset_match(const struct lzms_compressor *c,
+                                    struct lzms_mc_pos_data *cur_optimum_ptr,
+                                    u32 rep_len, unsigned rep_idx)
 {
-       u32 cost = 0;
-
-       state->lru.upcoming_offset = 0;
-       lzms_update_lz_lru_queues(&state->lru);
+       u32 len;
+       u32 base_cost;
+       u32 cost;
+       unsigned i;
 
-       cost += lzms_rc_bit_cost(&ctx->main_range_encoder,
-                                &state->main_state, 0);
+       base_cost = cur_optimum_ptr->cost;
 
-       cost += lzms_huffman_symbol_cost(&ctx->literal_encoder, literal);
+       base_cost += lzms_rc_bit_cost(&c->main_range_encoder,
+                                     cur_optimum_ptr->state.main_state, 1);
 
-       return cost;
-}
+       base_cost += lzms_rc_bit_cost(&c->match_range_encoder,
+                                     cur_optimum_ptr->state.match_state, 0);
 
-static u32
-lzms_get_lz_match_cost_nolen(struct lzms_compressor *ctx,
-                            struct lzms_adaptive_state *state, u32 offset)
-{
-       u32 cost = 0;
-       int recent_offset_idx;
+       base_cost += lzms_rc_bit_cost(&c->lz_match_range_encoder,
+                                     cur_optimum_ptr->state.lz_match_state, 1);
 
-       cost += lzms_rc_bit_cost(&ctx->main_range_encoder,
-                                &state->main_state, 1);
-       cost += lzms_rc_bit_cost(&ctx->match_range_encoder,
-                                &state->match_state, 0);
+       for (i = 0; i < rep_idx; i++)
+               base_cost += lzms_rc_bit_cost(&c->lz_repeat_match_range_encoders[i],
+                                             cur_optimum_ptr->state.lz_repeat_match_state[i], 1);
 
-       for (recent_offset_idx = 0;
-            recent_offset_idx < LZMS_NUM_RECENT_OFFSETS;
-            recent_offset_idx++)
-               if (offset == state->lru.recent_offsets[recent_offset_idx])
-                       break;
+       if (i < LZMS_NUM_RECENT_OFFSETS - 1)
+               base_cost += lzms_rc_bit_cost(&c->lz_repeat_match_range_encoders[i],
+                                             cur_optimum_ptr->state.lz_repeat_match_state[i], 0);
 
-       if (recent_offset_idx == LZMS_NUM_RECENT_OFFSETS) {
-               /* Explicit offset.  */
-               cost += lzms_rc_bit_cost(&ctx->lz_match_range_encoder,
-                                        &state->lz_match_state, 0);
+       len = 2;
+       do {
+               cost = base_cost + lzms_fast_length_cost(c, len);
+               if (cost < (cur_optimum_ptr + len)->cost) {
+                       (cur_optimum_ptr + len)->mc_item_data =
+                               ((u64)rep_idx << MC_OFFSET_SHIFT) | len;
+                       (cur_optimum_ptr + len)->cost = cost;
+               }
+       } while (++len <= rep_len);
+}
 
-               cost += lzms_offset_cost(&ctx->lz_offset_encoder, offset);
-       } else {
-               int i;
+/*
+ * Consider coding each match in @matches as an explicit offset match.
+ *
+ * @matches must be sorted by strictly increasing length and strictly increasing
+ * offset.  This is guaranteed by the match-finder.
+ *
+ * We consider each length from the minimum (2) to the longest
+ * (matches[num_matches - 1].len).  For each length, we consider only the
+ * smallest offset for which that length is available.  Although this is not
+ * guaranteed to be optimal due to the possibility of a larger offset costing
+ * less than a smaller offset to code, this is a very useful heuristic.
+ */
+static inline void
+lzms_consider_lz_explicit_offset_matches(const struct lzms_compressor *c,
+                                        struct lzms_mc_pos_data *cur_optimum_ptr,
+                                        const struct lz_match matches[],
+                                        u32 num_matches)
+{
+       u32 len;
+       u32 i;
+       u32 base_cost;
+       u32 position_cost;
+       u32 cost;
 
-               /* Recent offset.  */
-               cost += lzms_rc_bit_cost(&ctx->lz_match_range_encoder,
-                                        &state->lz_match_state, 1);
+       base_cost = cur_optimum_ptr->cost;
 
-               for (i = 0; i < recent_offset_idx; i++)
-                       cost += lzms_rc_bit_cost(&ctx->lz_repeat_match_range_encoders[i],
-                                                &state->lz_repeat_match_state[i], 0);
+       base_cost += lzms_rc_bit_cost(&c->main_range_encoder,
+                                     cur_optimum_ptr->state.main_state, 1);
 
-               if (i < LZMS_NUM_RECENT_OFFSETS - 1)
-                       cost += lzms_rc_bit_cost(&ctx->lz_repeat_match_range_encoders[i],
-                                                &state->lz_repeat_match_state[i], 1);
+       base_cost += lzms_rc_bit_cost(&c->match_range_encoder,
+                                     cur_optimum_ptr->state.match_state, 0);
 
+       base_cost += lzms_rc_bit_cost(&c->lz_match_range_encoder,
+                                     cur_optimum_ptr->state.lz_match_state, 0);
+       len = 2;
+       i = 0;
+       do {
+               position_cost = base_cost + lzms_lz_offset_cost(c,
+                                                               matches[i].offset);
+               do {
+                       cost = position_cost + lzms_fast_length_cost(c, len);
+                       if (cost < (cur_optimum_ptr + len)->cost) {
+                               (cur_optimum_ptr + len)->mc_item_data =
+                                       ((u64)(matches[i].offset + LZMS_OFFSET_OFFSET)
+                                               << MC_OFFSET_SHIFT) | len;
+                               (cur_optimum_ptr + len)->cost = cost;
+                       }
+               } while (++len <= matches[i].len);
+       } while (++i != num_matches);
+}
 
-               /* Initial update of the LZ match offset LRU queue.  */
-               for (; i < LZMS_NUM_RECENT_OFFSETS; i++)
-                       state->lru.recent_offsets[i] = state->lru.recent_offsets[i + 1];
-       }
+static void
+lzms_init_adaptive_state(struct lzms_adaptive_state *state)
+{
+       unsigned i;
+
+       lzms_init_lz_lru_queues(&state->lru);
+       state->main_state = 0;
+       state->match_state = 0;
+       state->lz_match_state = 0;
+       for (i = 0; i < LZMS_NUM_RECENT_OFFSETS - 1; i++)
+               state->lz_repeat_match_state[i] = 0;
+}
 
+static inline void
+lzms_update_main_state(struct lzms_adaptive_state *state, int is_match)
+{
+       state->main_state = ((state->main_state << 1) | is_match) % LZMS_NUM_MAIN_STATES;
+}
 
-       state->lru.upcoming_offset = offset;
-       lzms_update_lz_lru_queues(&state->lru);
+static inline void
+lzms_update_match_state(struct lzms_adaptive_state *state, int is_delta)
+{
+       state->match_state = ((state->match_state << 1) | is_delta) % LZMS_NUM_MATCH_STATES;
+}
 
-       return cost;
+static inline void
+lzms_update_lz_match_state(struct lzms_adaptive_state *state, int is_repeat_offset)
+{
+       state->lz_match_state = ((state->lz_match_state << 1) | is_repeat_offset) % LZMS_NUM_LZ_MATCH_STATES;
 }
 
-static u32
-lzms_get_lz_match_cost(struct lzms_compressor *ctx,
-                      struct lzms_adaptive_state *state,
-                      u32 length, u32 offset)
+static inline void
+lzms_update_lz_repeat_match_state(struct lzms_adaptive_state *state, int rep_idx)
 {
-       return lzms_get_lz_match_cost_nolen(ctx, state, offset) +
-              lzms_get_length_cost(&ctx->length_encoder, length);
+       int i;
+
+       for (i = 0; i < rep_idx; i++)
+               state->lz_repeat_match_state[i] =
+                       ((state->lz_repeat_match_state[i] << 1) | 1) %
+                               LZMS_NUM_LZ_REPEAT_MATCH_STATES;
+
+       if (i < LZMS_NUM_RECENT_OFFSETS - 1)
+               state->lz_repeat_match_state[i] =
+                       ((state->lz_repeat_match_state[i] << 1) | 0) %
+                               LZMS_NUM_LZ_REPEAT_MATCH_STATES;
 }
 
-static inline u32
-lzms_repsearch(const u8 * const strptr, const u32 bytes_remaining,
-              const struct lzms_lz_lru_queues *queue, u32 *offset_ret)
+/*
+ * The main near-optimal parsing routine.
+ *
+ * Briefly, the algorithm does an approximate minimum-cost path search to find a
+ * "near-optimal" sequence of matches and literals to output, based on the
+ * current cost model.  The algorithm steps forward, position by position (byte
+ * by byte), and updates the minimum cost path to reach each later position that
+ * can be reached using a match or literal from the current position.  This is
+ * essentially Dijkstra's algorithm in disguise: the graph nodes are positions,
+ * the graph edges are possible matches/literals to code, and the cost of each
+ * edge is the estimated number of bits that will be required to output the
+ * corresponding match or literal.  But one difference is that we actually
+ * compute the lowest-cost path in pieces, where each piece is terminated when
+ * there are no choices to be made.
+ *
+ * Notes:
+ *
+ * - This does not output any delta matches.
+ *
+ * - The costs of literals and matches are estimated using the range encoder
+ *   states and the semi-adaptive Huffman codes.  Except for range encoding
+ *   states, costs are assumed to be constant throughout a single run of the
+ *   parsing algorithm, which can parse up to @optim_array_length bytes of data.
+ *   This introduces a source of inaccuracy because the probabilities and
+ *   Huffman codes can change over this part of the data.
+ */
+static void
+lzms_near_optimal_parse(struct lzms_compressor *c)
 {
+       const u8 *window_ptr;
+       const u8 *window_end;
+       struct lzms_mc_pos_data *cur_optimum_ptr;
+       struct lzms_mc_pos_data *end_optimum_ptr;
+       u32 num_matches;
+       u32 longest_len;
+       u32 rep_max_len;
+       unsigned rep_max_idx;
+       unsigned literal;
+       unsigned i;
+       u32 cost;
        u32 len;
-       unsigned slot = 0;
+       u32 offset_data;
 
-       len = lz_repsearch(strptr, bytes_remaining, UINT32_MAX,
-                          queue->recent_offsets, LZMS_NUM_RECENT_OFFSETS, &slot);
-       *offset_ret = queue->recent_offsets[slot];
-       return len;
-}
+       window_ptr = c->cur_window;
+       window_end = window_ptr + c->cur_window_size;
 
+       lzms_init_adaptive_state(&c->optimum[0].state);
 
-static struct lz_match
-lzms_match_chooser_reverse_list(struct lzms_compressor *ctx, unsigned cur_pos)
-{
-       unsigned prev_link, saved_prev_link;
-       unsigned prev_match_offset, saved_prev_match_offset;
+begin:
+       /* Start building a new list of items, which will correspond to the next
+        * piece of the overall minimum-cost path.  */
 
-       ctx->optimum_end_idx = cur_pos;
+       cur_optimum_ptr = c->optimum;
+       cur_optimum_ptr->cost = 0;
+       end_optimum_ptr = cur_optimum_ptr;
 
-       saved_prev_link = ctx->optimum[cur_pos].prev.link;
-       saved_prev_match_offset = ctx->optimum[cur_pos].prev.match_offset;
+       /* States should currently be consistent with the encoders.  */
+       LZMS_ASSERT(cur_optimum_ptr->state.main_state == c->main_range_encoder.state);
+       LZMS_ASSERT(cur_optimum_ptr->state.match_state == c->match_range_encoder.state);
+       LZMS_ASSERT(cur_optimum_ptr->state.lz_match_state == c->lz_match_range_encoder.state);
+       for (i = 0; i < LZMS_NUM_RECENT_OFFSETS - 1; i++)
+               LZMS_ASSERT(cur_optimum_ptr->state.lz_repeat_match_state[i] ==
+                           c->lz_repeat_match_range_encoders[i].state);
 
-       do {
-               prev_link = saved_prev_link;
-               prev_match_offset = saved_prev_match_offset;
+       if (window_ptr == window_end)
+               return;
 
-               saved_prev_link = ctx->optimum[prev_link].prev.link;
-               saved_prev_match_offset = ctx->optimum[prev_link].prev.match_offset;
+       /* The following loop runs once per byte in the window, except in a
+        * couple of shortcut cases.  */
+       for (;;) {
 
-               ctx->optimum[prev_link].next.link = cur_pos;
-               ctx->optimum[prev_link].next.match_offset = prev_match_offset;
+               /* Find explicit offset matches with the current position.  */
+               num_matches = lz_mf_get_matches(c->mf, c->matches);
 
-               cur_pos = prev_link;
-       } while (cur_pos != 0);
+               if (num_matches) {
+                       /*
+                        * Find the longest repeat offset match with the current
+                        * position.
+                        *
+                        * Heuristics:
+                        *
+                        * - Only search for repeat offset matches if the
+                        *   match-finder already found at least one match.
+                        *
+                        * - Only consider the longest repeat offset match.  It
+                        *   seems to be rare for the optimal parse to include a
+                        *   repeat offset match that doesn't have the longest
+                        *   length (allowing for the possibility that not all
+                        *   of that length is actually used).
+                        */
+                       if (likely(window_ptr - c->cur_window >= LZMS_MAX_INIT_RECENT_OFFSET)) {
+                               BUILD_BUG_ON(LZMS_NUM_RECENT_OFFSETS != 3);
+                               rep_max_len = lz_repsearch3(window_ptr,
+                                                           window_end - window_ptr,
+                                                           cur_optimum_ptr->state.lru.recent_offsets,
+                                                           &rep_max_idx);
+                       } else {
+                               rep_max_len = 0;
+                       }
 
-       ctx->optimum_cur_idx = ctx->optimum[0].next.link;
+                       if (rep_max_len) {
+                               /* If there's a very long repeat offset match,
+                                * choose it immediately.  */
+                               if (rep_max_len >= c->params.nice_match_length) {
 
-       return (struct lz_match)
-               { .len = ctx->optimum_cur_idx,
-                 .offset = ctx->optimum[0].next.match_offset,
-               };
-}
+                                       lz_mf_skip_positions(c->mf, rep_max_len - 1);
+                                       window_ptr += rep_max_len;
 
-/* This is similar to lzx_choose_near_optimal_item() in lzx-compress.c.
- * Read that one if you want to understand it.  */
-static struct lz_match
-lzms_get_near_optimal_item(struct lzms_compressor *ctx)
-{
-       u32 num_matches;
-       struct lz_match *matches;
-       struct lz_match match;
-       u32 longest_len;
-       u32 longest_rep_len;
-       u32 longest_rep_offset;
-       unsigned cur_pos;
-       unsigned end_pos;
-       struct lzms_adaptive_state initial_state;
-
-       if (ctx->optimum_cur_idx != ctx->optimum_end_idx) {
-               match.len = ctx->optimum[ctx->optimum_cur_idx].next.link -
-                                   ctx->optimum_cur_idx;
-               match.offset = ctx->optimum[ctx->optimum_cur_idx].next.match_offset;
-
-               ctx->optimum_cur_idx = ctx->optimum[ctx->optimum_cur_idx].next.link;
-               return match;
-       }
+                                       if (cur_optimum_ptr != c->optimum)
+                                               lzms_encode_item_list(c, cur_optimum_ptr);
 
-       ctx->optimum_cur_idx = 0;
-       ctx->optimum_end_idx = 0;
+                                       lzms_encode_lz_repeat_offset_match(c, rep_max_len,
+                                                                          rep_max_idx);
 
-       if (lz_mf_get_position(ctx->mf) >= LZMS_MAX_INIT_RECENT_OFFSET) {
-               longest_rep_len = lzms_repsearch(lz_mf_get_window_ptr(ctx->mf),
-                                                lz_mf_get_bytes_remaining(ctx->mf),
-                                                &ctx->lru.lz, &longest_rep_offset);
-       } else {
-               longest_rep_len = 0;
-       }
+                                       c->optimum[0].state = cur_optimum_ptr->state;
 
-       if (longest_rep_len >= ctx->params.nice_match_length) {
-               lzms_skip_bytes(ctx, longest_rep_len);
-               return (struct lz_match) {
-                       .len = longest_rep_len,
-                       .offset = longest_rep_offset,
-               };
-       }
+                                       lzms_update_main_state(&c->optimum[0].state, 1);
+                                       lzms_update_match_state(&c->optimum[0].state, 0);
+                                       lzms_update_lz_match_state(&c->optimum[0].state, 1);
+                                       lzms_update_lz_repeat_match_state(&c->optimum[0].state,
+                                                                         rep_max_idx);
 
-       num_matches = lzms_get_matches(ctx, &matches);
+                                       c->optimum[0].state.lru.upcoming_offset =
+                                               c->optimum[0].state.lru.recent_offsets[rep_max_idx];
 
-       if (num_matches) {
-               longest_len = matches[num_matches - 1].len;
-               if (longest_len >= ctx->params.nice_match_length) {
-                       lzms_skip_bytes(ctx, longest_len - 1);
-                       return matches[num_matches - 1];
-               }
-       } else {
-               longest_len = 1;
-       }
+                                       for (i = rep_max_idx; i < LZMS_NUM_RECENT_OFFSETS; i++)
+                                               c->optimum[0].state.lru.recent_offsets[i] =
+                                                       c->optimum[0].state.lru.recent_offsets[i + 1];
 
-       initial_state.lru = ctx->lru.lz;
-       initial_state.main_state = ctx->main_range_encoder.state;
-       initial_state.match_state = ctx->match_range_encoder.state;
-       initial_state.lz_match_state = ctx->lz_match_range_encoder.state;
-       for (int i = 0; i < LZMS_NUM_RECENT_OFFSETS - 1; i++)
-               initial_state.lz_repeat_match_state[i] = ctx->lz_repeat_match_range_encoders[i].state;
+                                       lzms_update_lz_lru_queue(&c->optimum[0].state.lru);
+                                       goto begin;
+                               }
 
-       ctx->optimum[1].state = initial_state;
-       ctx->optimum[1].cost = lzms_get_literal_cost(ctx,
-                                                    &ctx->optimum[1].state,
-                                                    *(lz_mf_get_window_ptr(ctx->mf) - 1));
-       ctx->optimum[1].prev.link = 0;
+                               /* If reaching any positions for the first time,
+                                * initialize their costs to "infinity".  */
+                               while (end_optimum_ptr < cur_optimum_ptr + rep_max_len)
+                                       (++end_optimum_ptr)->cost = MC_INFINITE_COST;
 
-       for (u32 i = 0, len = 2; i < num_matches; i++) {
-               u32 offset = matches[i].offset;
-               struct lzms_adaptive_state state;
-               u32 position_cost;
+                               /* Consider coding a repeat offset match.  */
+                               lzms_consider_lz_repeat_offset_match(c, cur_optimum_ptr,
+                                                                    rep_max_len, rep_max_idx);
+                       }
 
-               state = initial_state;
-               position_cost = 0;
-               position_cost += lzms_get_lz_match_cost_nolen(ctx, &state, offset);
+                       longest_len = c->matches[num_matches - 1].len;
 
-               do {
-                       u32 cost;
+                       /* If there's a very long explicit offset match, choose
+                        * it immediately.  */
+                       if (longest_len >= c->params.nice_match_length) {
 
-                       cost = position_cost;
-                       cost += lzms_get_length_cost(&ctx->length_encoder, len);
+                               lz_mf_skip_positions(c->mf, longest_len - 1);
+                               window_ptr += longest_len;
 
-                       ctx->optimum[len].state = state;
-                       ctx->optimum[len].prev.link = 0;
-                       ctx->optimum[len].prev.match_offset = offset;
-                       ctx->optimum[len].cost = cost;
-               } while (++len <= matches[i].len);
-       }
-       end_pos = longest_len;
-
-       if (longest_rep_len) {
-               struct lzms_adaptive_state state;
-               u32 cost;
-
-               while (end_pos < longest_rep_len)
-                       ctx->optimum[++end_pos].cost = MC_INFINITE_COST;
-
-               state = initial_state;
-               cost = lzms_get_lz_match_cost(ctx,
-                                             &state,
-                                             longest_rep_len,
-                                             longest_rep_offset);
-               if (cost <= ctx->optimum[longest_rep_len].cost) {
-                       ctx->optimum[longest_rep_len].state = state;
-                       ctx->optimum[longest_rep_len].prev.link = 0;
-                       ctx->optimum[longest_rep_len].prev.match_offset = longest_rep_offset;
-                       ctx->optimum[longest_rep_len].cost = cost;
-               }
-       }
+                               if (cur_optimum_ptr != c->optimum)
+                                       lzms_encode_item_list(c, cur_optimum_ptr);
 
-       cur_pos = 0;
-       for (;;) {
-               u32 cost;
-               struct lzms_adaptive_state state;
+                               lzms_encode_lz_explicit_offset_match(c, longest_len,
+                                                                    c->matches[num_matches - 1].offset);
 
-               cur_pos++;
+                               c->optimum[0].state = cur_optimum_ptr->state;
 
-               if (cur_pos == end_pos || cur_pos == ctx->params.optim_array_length)
-                       return lzms_match_chooser_reverse_list(ctx, cur_pos);
+                               lzms_update_main_state(&c->optimum[0].state, 1);
+                               lzms_update_match_state(&c->optimum[0].state, 0);
+                               lzms_update_lz_match_state(&c->optimum[0].state, 0);
 
-               if (lz_mf_get_position(ctx->mf) >= LZMS_MAX_INIT_RECENT_OFFSET) {
-                       longest_rep_len = lzms_repsearch(lz_mf_get_window_ptr(ctx->mf),
-                                                        lz_mf_get_bytes_remaining(ctx->mf),
-                                                        &ctx->optimum[cur_pos].state.lru,
-                                                        &longest_rep_offset);
-               } else {
-                       longest_rep_len = 0;
-               }
+                               c->optimum[0].state.lru.upcoming_offset =
+                                       c->matches[num_matches - 1].offset;
 
-               if (longest_rep_len >= ctx->params.nice_match_length) {
-                       match = lzms_match_chooser_reverse_list(ctx, cur_pos);
+                               lzms_update_lz_lru_queue(&c->optimum[0].state.lru);
+                               goto begin;
+                       }
 
-                       ctx->optimum[cur_pos].next.match_offset = longest_rep_offset;
-                       ctx->optimum[cur_pos].next.link = cur_pos + longest_rep_len;
-                       ctx->optimum_end_idx = cur_pos + longest_rep_len;
+                       /* If reaching any positions for the first time,
+                        * initialize their costs to "infinity".  */
+                       while (end_optimum_ptr < cur_optimum_ptr + longest_len)
+                               (++end_optimum_ptr)->cost = MC_INFINITE_COST;
 
-                       lzms_skip_bytes(ctx, longest_rep_len);
+                       /* Consider coding an explicit offset match.  */
+                       lzms_consider_lz_explicit_offset_matches(c, cur_optimum_ptr,
+                                                                c->matches, num_matches);
+               } else {
+                       /* No matches found.  The only choice at this position
+                        * is to code a literal.  */
 
-                       return match;
+                       if (end_optimum_ptr == cur_optimum_ptr)
+                               (++end_optimum_ptr)->cost = MC_INFINITE_COST;
                }
 
-               num_matches = lzms_get_matches(ctx, &matches);
+               /* Consider coding a literal.
 
-               if (num_matches) {
-                       longest_len = matches[num_matches - 1].len;
-                       if (longest_len >= ctx->params.nice_match_length) {
-                               match = lzms_match_chooser_reverse_list(ctx, cur_pos);
+                * To avoid an extra unpredictable branch, the check for
+                * whether coding the literal is preferable is integrated into
+                * the adaptive state update code below.  */
+               literal = *window_ptr++;
+               cost = cur_optimum_ptr->cost +
+                      lzms_literal_cost(c, literal, &cur_optimum_ptr->state);
 
-                               ctx->optimum[cur_pos].next.match_offset =
-                                       matches[num_matches - 1].offset;
-                               ctx->optimum[cur_pos].next.link = cur_pos + longest_len;
-                               ctx->optimum_end_idx = cur_pos + longest_len;
+               /* Advance to the next position.  */
+               cur_optimum_ptr++;
 
-                               lzms_skip_bytes(ctx, longest_len - 1);
+               /* The lowest-cost path to the current position is now known.
+                * Finalize the adaptive state that results from taking this
+                * lowest-cost path.  */
 
-                               return match;
-                       }
-               } else {
-                       longest_len = 1;
-               }
+               if (cost < cur_optimum_ptr->cost) {
+                       /* Literal  */
+                       cur_optimum_ptr->cost = cost;
+                       cur_optimum_ptr->mc_item_data = ((u64)literal << MC_OFFSET_SHIFT) | 1;
 
-               while (end_pos < cur_pos + longest_len)
-                       ctx->optimum[++end_pos].cost = MC_INFINITE_COST;
-
-               state = ctx->optimum[cur_pos].state;
-               cost = ctx->optimum[cur_pos].cost +
-                       lzms_get_literal_cost(ctx,
-                                             &state,
-                                             *(lz_mf_get_window_ptr(ctx->mf) - 1));
-               if (cost < ctx->optimum[cur_pos + 1].cost) {
-                       ctx->optimum[cur_pos + 1].state = state;
-                       ctx->optimum[cur_pos + 1].cost = cost;
-                       ctx->optimum[cur_pos + 1].prev.link = cur_pos;
-               }
+                       cur_optimum_ptr->state = (cur_optimum_ptr - 1)->state;
 
-               for (u32 i = 0, len = 2; i < num_matches; i++) {
-                       u32 offset = matches[i].offset;
-                       struct lzms_adaptive_state state;
-                       u32 position_cost;
+                       lzms_update_main_state(&cur_optimum_ptr->state, 0);
 
-                       state = ctx->optimum[cur_pos].state;
-                       position_cost = ctx->optimum[cur_pos].cost;
-                       position_cost += lzms_get_lz_match_cost_nolen(ctx, &state, offset);
+                       cur_optimum_ptr->state.lru.upcoming_offset = 0;
+               } else {
+                       /* LZ match  */
+                       len = cur_optimum_ptr->mc_item_data & MC_LEN_MASK;
+                       offset_data = cur_optimum_ptr->mc_item_data >> MC_OFFSET_SHIFT;
 
-                       do {
-                               u32 cost;
+                       cur_optimum_ptr->state = (cur_optimum_ptr - len)->state;
 
-                               cost = position_cost;
-                               cost += lzms_get_length_cost(&ctx->length_encoder, len);
+                       lzms_update_main_state(&cur_optimum_ptr->state, 1);
+                       lzms_update_match_state(&cur_optimum_ptr->state, 0);
 
-                               if (cost < ctx->optimum[cur_pos + len].cost) {
-                                       ctx->optimum[cur_pos + len].state = state;
-                                       ctx->optimum[cur_pos + len].prev.link = cur_pos;
-                                       ctx->optimum[cur_pos + len].prev.match_offset = offset;
-                                       ctx->optimum[cur_pos + len].cost = cost;
-                               }
-                       } while (++len <= matches[i].len);
-               }
+                       if (offset_data >= LZMS_NUM_RECENT_OFFSETS) {
 
-               if (longest_rep_len >= ctx->params.min_match_length) {
-
-                       while (end_pos < cur_pos + longest_rep_len)
-                               ctx->optimum[++end_pos].cost = MC_INFINITE_COST;
-
-                       state = ctx->optimum[cur_pos].state;
-
-                       cost = ctx->optimum[cur_pos].cost +
-                               lzms_get_lz_match_cost(ctx,
-                                                      &state,
-                                                      longest_rep_len,
-                                                      longest_rep_offset);
-                       if (cost <= ctx->optimum[cur_pos + longest_rep_len].cost) {
-                               ctx->optimum[cur_pos + longest_rep_len].state =
-                                       state;
-                               ctx->optimum[cur_pos + longest_rep_len].prev.link =
-                                       cur_pos;
-                               ctx->optimum[cur_pos + longest_rep_len].prev.match_offset =
-                                       longest_rep_offset;
-                               ctx->optimum[cur_pos + longest_rep_len].cost =
-                                       cost;
-                       }
-               }
-       }
-}
+                               /* Explicit offset LZ match  */
 
-/*
- * The main loop for the LZMS compressor.
- *
- * Notes:
- *
- * - This does not output any delta matches.
- *
- * - The costs of literals and matches are estimated using the range encoder
- *   states and the semi-adaptive Huffman codes.  Except for range encoding
- *   states, costs are assumed to be constant throughout a single run of the
- *   parsing algorithm, which can parse up to @optim_array_length bytes of data.
- *   This introduces a source of inaccuracy because the probabilities and
- *   Huffman codes can change over this part of the data.
- */
-static void
-lzms_encode(struct lzms_compressor *ctx)
-{
-       struct lz_match item;
+                               lzms_update_lz_match_state(&cur_optimum_ptr->state, 0);
+
+                               cur_optimum_ptr->state.lru.upcoming_offset =
+                                       offset_data - LZMS_OFFSET_OFFSET;
+                       } else {
+                               /* Repeat offset LZ match  */
 
-       /* Load window into the match-finder.  */
-       lz_mf_load_window(ctx->mf, ctx->window, ctx->window_size);
+                               lzms_update_lz_match_state(&cur_optimum_ptr->state, 1);
+                               lzms_update_lz_repeat_match_state(&cur_optimum_ptr->state,
+                                                                 offset_data);
 
-       /* Reset the match-chooser.  */
-       ctx->optimum_cur_idx = 0;
-       ctx->optimum_end_idx = 0;
+                               cur_optimum_ptr->state.lru.upcoming_offset =
+                                       cur_optimum_ptr->state.lru.recent_offsets[offset_data];
 
-       while (ctx->cur_window_pos != ctx->window_size) {
-               item = lzms_get_near_optimal_item(ctx);
-               if (item.len <= 1)
-                       lzms_encode_literal(ctx, ctx->window[ctx->cur_window_pos]);
-               else
-                       lzms_encode_lz_match(ctx, item.len, item.offset);
+                               for (i = offset_data; i < LZMS_NUM_RECENT_OFFSETS; i++)
+                                       cur_optimum_ptr->state.lru.recent_offsets[i] =
+                                               cur_optimum_ptr->state.lru.recent_offsets[i + 1];
+                       }
+               }
+
+               lzms_update_lz_lru_queue(&cur_optimum_ptr->state.lru);
+
+               /*
+                * This loop will terminate when either of the following
+                * conditions is true:
+                *
+                * (1) cur_optimum_ptr == end_optimum_ptr
+                *
+                *      There are no paths that extend beyond the current
+                *      position.  In this case, any path to a later position
+                *      must pass through the current position, so we can go
+                *      ahead and choose the list of items that led to this
+                *      position.
+                *
+                * (2) cur_optimum_ptr == c->optimum_end
+                *
+                *      This bounds the number of times the algorithm can step
+                *      forward before it is guaranteed to start choosing items.
+                *      This limits the memory usage.  It also guarantees that
+                *      the parser will not go too long without updating the
+                *      probability tables.
+                *
+                * Note: no check for end-of-block is needed because
+                * end-of-block will trigger condition (1).
+                */
+               if (cur_optimum_ptr == end_optimum_ptr ||
+                   cur_optimum_ptr == c->optimum_end)
+               {
+                       c->optimum[0].state = cur_optimum_ptr->state;
+                       break;
+               }
        }
+
+       /* Output the current list of items that constitute the minimum-cost
+        * path to the current position.  */
+       lzms_encode_item_list(c, cur_optimum_ptr);
+       goto begin;
 }
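
The termination logic above is easiest to see in isolation.  The following is an
editorial sketch of the bounded forward-parse loop the comment describes; the
names (pos_data, input_remains) are illustrative, not wimlib's actual API:

#include <stdint.h>

struct pos_data { uint32_t cost; /* ...item choice, adaptive state... */ };

static void near_optimal_parse_sketch(struct pos_data *optimum,
                                      struct pos_data *optimum_end,
                                      int (*input_remains)(void))
{
        struct pos_data *cur, *end;
begin:
        cur = end = optimum;
        for (;;) {
                /* ...search for matches at 'cur'; relaxing the costs of
                 * later positions advances 'end' past 'cur'... */
                cur++;
                if (cur == end || cur == optimum_end)
                        break;  /* conditions (1) and (2) above */
        }
        /* ...trace back from 'cur' and encode the chosen item list... */
        if (input_remains())
                goto begin;
}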
 
 static void
@@ -1154,6 +1241,7 @@ lzms_init_range_encoder(struct lzms_range_encoder *enc,
 {
        enc->rc = rc;
        enc->state = 0;
+       LZMS_ASSERT(is_power_of_2(num_states));
        enc->mask = num_states - 1;
        for (u32 i = 0; i < num_states; i++) {
                enc->prob_entries[i].num_recent_zero_bits = LZMS_INITIAL_PROBABILITY;
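
The new assertion documents the invariant that the state masking relies on.  A
self-contained illustration (a sketch, not wimlib's code) of why num_states must
be a power of 2:

#include <assert.h>
#include <stdint.h>

static int is_power_of_2(uint32_t n)
{
        return n != 0 && (n & (n - 1)) == 0;
}

/* The model's state is a shift register of recently coded bits; masking
 * with (num_states - 1) yields a valid table index only for powers of 2. */
static uint32_t next_state(uint32_t state, int bit, uint32_t num_states)
{
        assert(is_power_of_2(num_states));
        return ((state << 1) | bit) & (num_states - 1);
}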
@@ -1181,77 +1269,72 @@ lzms_init_huffman_encoder(struct lzms_huffman_encoder *enc,
                                    enc->codewords);
 }
 
-/* Initialize the LZMS compressor.  */
+/* Prepare the LZMS compressor for compressing a block of data.  */
 static void
-lzms_init_compressor(struct lzms_compressor *ctx, const u8 *udata, u32 ulen,
-                    le16 *cdata, u32 clen16)
+lzms_prepare_compressor(struct lzms_compressor *c, const u8 *udata, u32 ulen,
+                       le16 *cdata, u32 clen16)
 {
-       unsigned num_position_slots;
+       unsigned num_offset_slots;
 
-       /* Copy the uncompressed data into the @ctx->window buffer.  */
-       memcpy(ctx->window, udata, ulen);
-       ctx->cur_window_pos = 0;
-       ctx->window_size = ulen;
+       /* Copy the uncompressed data into the @c->cur_window buffer.  */
+       memcpy(c->cur_window, udata, ulen);
+       c->cur_window_size = ulen;
 
        /* Initialize the raw range encoder (writing forwards).  */
-       lzms_range_encoder_raw_init(&ctx->rc, cdata, clen16);
+       lzms_range_encoder_raw_init(&c->rc, cdata, clen16);
 
        /* Initialize the output bitstream for Huffman symbols and verbatim bits
         * (writing backwards).  */
-       lzms_output_bitstream_init(&ctx->os, cdata, clen16);
-
-       /* Calculate the number of position slots needed for this compressed
-        * block.  */
-       num_position_slots = lzms_get_position_slot(ulen - 1) + 1;
+       lzms_output_bitstream_init(&c->os, cdata, clen16);
 
-       LZMS_DEBUG("Using %u position slots", num_position_slots);
+       /* Calculate the number of offset slots required.  */
+       num_offset_slots = lzms_get_offset_slot(ulen - 1) + 1;
 
-       /* Initialize Huffman encoders for each alphabet used in the compressed
-        * representation.  */
-       lzms_init_huffman_encoder(&ctx->literal_encoder, &ctx->os,
+       /* Initialize a Huffman encoder for each alphabet.  */
+       lzms_init_huffman_encoder(&c->literal_encoder, &c->os,
                                  LZMS_NUM_LITERAL_SYMS,
                                  LZMS_LITERAL_CODE_REBUILD_FREQ);
 
-       lzms_init_huffman_encoder(&ctx->lz_offset_encoder, &ctx->os,
-                                 num_position_slots,
+       lzms_init_huffman_encoder(&c->lz_offset_encoder, &c->os,
+                                 num_offset_slots,
                                  LZMS_LZ_OFFSET_CODE_REBUILD_FREQ);
 
-       lzms_init_huffman_encoder(&ctx->length_encoder, &ctx->os,
+       lzms_init_huffman_encoder(&c->length_encoder, &c->os,
                                  LZMS_NUM_LEN_SYMS,
                                  LZMS_LENGTH_CODE_REBUILD_FREQ);
 
-       lzms_init_huffman_encoder(&ctx->delta_offset_encoder, &ctx->os,
-                                 num_position_slots,
+       lzms_init_huffman_encoder(&c->delta_offset_encoder, &c->os,
+                                 num_offset_slots,
                                  LZMS_DELTA_OFFSET_CODE_REBUILD_FREQ);
 
-       lzms_init_huffman_encoder(&ctx->delta_power_encoder, &ctx->os,
+       lzms_init_huffman_encoder(&c->delta_power_encoder, &c->os,
                                  LZMS_NUM_DELTA_POWER_SYMS,
                                  LZMS_DELTA_POWER_CODE_REBUILD_FREQ);
 
        /* Initialize range encoders, all of which wrap around the same
         * lzms_range_encoder_raw.  */
-       lzms_init_range_encoder(&ctx->main_range_encoder,
-                               &ctx->rc, LZMS_NUM_MAIN_STATES);
+       lzms_init_range_encoder(&c->main_range_encoder,
+                               &c->rc, LZMS_NUM_MAIN_STATES);
 
-       lzms_init_range_encoder(&ctx->match_range_encoder,
-                               &ctx->rc, LZMS_NUM_MATCH_STATES);
+       lzms_init_range_encoder(&c->match_range_encoder,
+                               &c->rc, LZMS_NUM_MATCH_STATES);
 
-       lzms_init_range_encoder(&ctx->lz_match_range_encoder,
-                               &ctx->rc, LZMS_NUM_LZ_MATCH_STATES);
+       lzms_init_range_encoder(&c->lz_match_range_encoder,
+                               &c->rc, LZMS_NUM_LZ_MATCH_STATES);
 
-       for (size_t i = 0; i < ARRAY_LEN(ctx->lz_repeat_match_range_encoders); i++)
-               lzms_init_range_encoder(&ctx->lz_repeat_match_range_encoders[i],
-                                       &ctx->rc, LZMS_NUM_LZ_REPEAT_MATCH_STATES);
+       for (unsigned i = 0; i < ARRAY_LEN(c->lz_repeat_match_range_encoders); i++)
+               lzms_init_range_encoder(&c->lz_repeat_match_range_encoders[i],
+                                       &c->rc, LZMS_NUM_LZ_REPEAT_MATCH_STATES);
 
-       lzms_init_range_encoder(&ctx->delta_match_range_encoder,
-                               &ctx->rc, LZMS_NUM_DELTA_MATCH_STATES);
+       lzms_init_range_encoder(&c->delta_match_range_encoder,
+                               &c->rc, LZMS_NUM_DELTA_MATCH_STATES);
 
-       for (size_t i = 0; i < ARRAY_LEN(ctx->delta_repeat_match_range_encoders); i++)
-               lzms_init_range_encoder(&ctx->delta_repeat_match_range_encoders[i],
-                                       &ctx->rc, LZMS_NUM_DELTA_REPEAT_MATCH_STATES);
+       for (unsigned i = 0; i < ARRAY_LEN(c->delta_repeat_match_range_encoders); i++)
+               lzms_init_range_encoder(&c->delta_repeat_match_range_encoders[i],
+                                       &c->rc, LZMS_NUM_DELTA_REPEAT_MATCH_STATES);
 
-       /* Initialize LRU match information.  */
-       lzms_init_lru_queues(&ctx->lru);
+       /* Set initial length costs for lengths < LZMS_NUM_FAST_LENGTHS.  */
+       lzms_update_fast_length_costs(c);
 }
 
 /* Flush the output streams, prepare the final compressed data, and return its
@@ -1260,66 +1343,77 @@ lzms_init_compressor(struct lzms_compressor *ctx, const u8 *udata, u32 ulen,
  * A return value of 0 indicates that the data could not be compressed to fit in
  * the available space.  */
 static size_t
-lzms_finalize(struct lzms_compressor *ctx, u8 *cdata, size_t csize_avail)
+lzms_finalize(struct lzms_compressor *c, u8 *cdata, size_t csize_avail)
 {
        size_t num_forwards_bytes;
        size_t num_backwards_bytes;
-       size_t compressed_size;
 
        /* Flush both the forwards and backwards streams, and make sure they
         * didn't cross each other and start overwriting each other's data.  */
-       if (!lzms_output_bitstream_flush(&ctx->os)) {
-               LZMS_DEBUG("Backwards bitstream overrun.");
+       if (!lzms_output_bitstream_flush(&c->os))
                return 0;
-       }
 
-       if (!lzms_range_encoder_raw_flush(&ctx->rc)) {
-               LZMS_DEBUG("Forwards bitstream overrun.");
+       if (!lzms_range_encoder_raw_flush(&c->rc))
                return 0;
-       }
 
-       if (ctx->rc.out > ctx->os.out) {
-               LZMS_DEBUG("Two bitstreams crossed.");
+       if (c->rc.next > c->os.next)
                return 0;
-       }
 
        /* Now the compressed buffer contains the data output by the forwards
         * bitstream, then empty space, then data output by the backwards
         * bitstream.  Move the data output by the backwards bitstream to be
         * adjacent to the data output by the forward bitstream, and calculate
         * the compressed size that this results in.  */
-       num_forwards_bytes = (u8*)ctx->rc.out - (u8*)cdata;
-       num_backwards_bytes = ((u8*)cdata + csize_avail) - (u8*)ctx->os.out;
+       num_forwards_bytes = (u8*)c->rc.next - (u8*)cdata;
+       num_backwards_bytes = ((u8*)cdata + csize_avail) - (u8*)c->os.next;
 
-       memmove(cdata + num_forwards_bytes, ctx->os.out, num_backwards_bytes);
+       memmove(cdata + num_forwards_bytes, c->os.next, num_backwards_bytes);
 
-       compressed_size = num_forwards_bytes + num_backwards_bytes;
-       LZMS_DEBUG("num_forwards_bytes=%zu, num_backwards_bytes=%zu, "
-                  "compressed_size=%zu",
-                  num_forwards_bytes, num_backwards_bytes, compressed_size);
-       LZMS_ASSERT(compressed_size % 2 == 0);
-       return compressed_size;
+       return num_forwards_bytes + num_backwards_bytes;
 }
 
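
For reference, the buffer arithmetic in the simplified lzms_finalize() can be
pictured as follows; fwd_next and bwd_next stand in for c->rc.next and
c->os.next, and this is an editorial sketch rather than the function itself:

#include <stddef.h>
#include <string.h>

/* Layout after encoding: [forwards data][free space][backwards data].
 * Slide the backwards data left so the two streams become contiguous. */
static size_t pack_two_streams(unsigned char *cdata, size_t csize_avail,
                               unsigned char *fwd_next, unsigned char *bwd_next)
{
        if (fwd_next > bwd_next)
                return 0;  /* streams crossed: the output did not fit */

        size_t num_forwards_bytes  = fwd_next - cdata;
        size_t num_backwards_bytes = (cdata + csize_avail) - bwd_next;

        memmove(cdata + num_forwards_bytes, bwd_next, num_backwards_bytes);
        return num_forwards_bytes + num_backwards_bytes;
}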
-
+/* Set internal compression parameters for the specified compression level and
+ * maximum window size.  */
 static void
 lzms_build_params(unsigned int compression_level,
                  struct lzms_compressor_params *lzms_params)
 {
-       lzms_params->min_match_length  = (compression_level >= 50) ? 2 : 3;
-       lzms_params->nice_match_length = max(((u64)compression_level * 32) / 50,
-                                            lzms_params->min_match_length);
-       lzms_params->max_search_depth  = ((u64)compression_level * 50) / 50;
-       lzms_params->optim_array_length = 224 + compression_level * 16;
+       /* Allow length 2 matches if the compression level is sufficiently high.
+        */
+       if (compression_level >= 45)
+               lzms_params->min_match_length = 2;
+       else
+               lzms_params->min_match_length = 3;
+
+       /* Scale nice_match_length and max_search_depth with the compression
+        * level.  But to allow an optimization on length cost calculations,
+        * don't allow nice_match_length to exceed LZMS_NUM_FAST_LENGTHS.  */
+       lzms_params->nice_match_length = ((u64)compression_level * 32) / 50;
+       if (lzms_params->nice_match_length < lzms_params->min_match_length)
+               lzms_params->nice_match_length = lzms_params->min_match_length;
+       if (lzms_params->nice_match_length > LZMS_NUM_FAST_LENGTHS)
+               lzms_params->nice_match_length = LZMS_NUM_FAST_LENGTHS;
+       lzms_params->max_search_depth = compression_level;
+
+       lzms_params->optim_array_length = 1024;
 }
 
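
Evaluated at the default compression level of 50, the mapping above yields the
following values (an editorial worked example; it assumes LZMS_NUM_FAST_LENGTHS
is at least 32, so the clamp leaves nice_match_length unchanged):

static void params_at_level_50_example(void)
{
        unsigned level = 50;
        unsigned min_match_length   = (level >= 45) ? 2 : 3;  /* -> 2  */
        unsigned nice_match_length  = (level * 32) / 50;      /* -> 32 */
        unsigned max_search_depth   = level;                  /* -> 50 */
        unsigned optim_array_length = 1024;                   /* fixed */
}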
+/* Given the internal compression parameters and maximum window size, build the
+ * Lempel-Ziv match-finder parameters.  */
 static void
 lzms_build_mf_params(const struct lzms_compressor_params *lzms_params,
                     u32 max_window_size, struct lz_mf_params *mf_params)
 {
        memset(mf_params, 0, sizeof(*mf_params));
 
-       mf_params->algorithm = LZ_MF_DEFAULT;
+       /* Choose an appropriate match-finding algorithm.  */
+       if (max_window_size <= 2097152)
+               mf_params->algorithm = LZ_MF_BINARY_TREES;
+       else if (max_window_size <= 33554432)
+               mf_params->algorithm = LZ_MF_LCP_INTERVAL_TREE;
+       else
+               mf_params->algorithm = LZ_MF_LINKED_SUFFIX_ARRAY;
+
        mf_params->max_window_size = max_window_size;
        mf_params->min_match_len = lzms_params->min_match_length;
        mf_params->max_search_depth = lzms_params->max_search_depth;
@@ -1327,23 +1421,34 @@ lzms_build_mf_params(const struct lzms_compressor_params *lzms_params,
 }
 
 static void
-lzms_free_compressor(void *_ctx);
+lzms_free_compressor(void *_c);
 
 static u64
 lzms_get_needed_memory(size_t max_block_size, unsigned int compression_level)
 {
        struct lzms_compressor_params params;
+       struct lz_mf_params mf_params;
        u64 size = 0;
 
        if (max_block_size >= INT32_MAX)
                return 0;
 
        lzms_build_params(compression_level, &params);
+       lzms_build_mf_params(&params, max_block_size, &mf_params);
 
        size += sizeof(struct lzms_compressor);
+
+       /* cur_window */
        size += max_block_size;
-       size += lz_mf_get_needed_memory(LZ_MF_DEFAULT, max_block_size);
-       size += params.max_search_depth * sizeof(struct lz_match);
+
+       /* mf */
+       size += lz_mf_get_needed_memory(mf_params.algorithm, max_block_size);
+
+       /* matches */
+       size += min(params.max_search_depth, params.nice_match_length) *
+               sizeof(struct lz_match);
+
+       /* optimum */
        size += (params.optim_array_length + params.nice_match_length) *
                sizeof(struct lzms_mc_pos_data);
 
@@ -1354,7 +1459,7 @@ static int
 lzms_create_compressor(size_t max_block_size, unsigned int compression_level,
                       void **ctx_ret)
 {
-       struct lzms_compressor *ctx;
+       struct lzms_compressor *c;
        struct lzms_compressor_params params;
        struct lz_mf_params mf_params;
 
@@ -1366,60 +1471,56 @@ lzms_create_compressor(size_t max_block_size, unsigned int compression_level,
        if (!lz_mf_params_valid(&mf_params))
                return WIMLIB_ERR_INVALID_PARAM;
 
-       ctx = CALLOC(1, sizeof(struct lzms_compressor));
-       if (!ctx)
+       c = CALLOC(1, sizeof(struct lzms_compressor));
+       if (!c)
                goto oom;
 
-       ctx->params = params;
-       ctx->max_block_size = max_block_size;
+       c->params = params;
 
-       ctx->window = MALLOC(max_block_size);
-       if (!ctx->window)
+       c->cur_window = MALLOC(max_block_size);
+       if (!c->cur_window)
                goto oom;
 
-       ctx->mf = lz_mf_alloc(&mf_params);
-       if (!ctx->mf)
+       c->mf = lz_mf_alloc(&mf_params);
+       if (!c->mf)
                goto oom;
 
-       ctx->matches = MALLOC(params.max_search_depth * sizeof(struct lz_match));
-       if (!ctx->matches)
+       c->matches = MALLOC(min(params.max_search_depth,
+                               params.nice_match_length) *
+                           sizeof(struct lz_match));
+       if (!c->matches)
                goto oom;
 
-       ctx->optimum = MALLOC((params.optim_array_length +
-                              params.nice_match_length) *
-                               sizeof(struct lzms_mc_pos_data));
-       if (!ctx->optimum)
+       c->optimum = MALLOC((params.optim_array_length +
+                            params.nice_match_length) *
+                           sizeof(struct lzms_mc_pos_data));
+       if (!c->optimum)
                goto oom;
+       c->optimum_end = &c->optimum[params.optim_array_length];
 
-       /* Initialize position and length slot data if not done already.  */
        lzms_init_slots();
 
-       /* Initialize range encoding cost table if not done already.  */
        lzms_init_rc_costs();
 
-       *ctx_ret = ctx;
+       lzms_init_fast_slots(c);
+
+       *ctx_ret = c;
        return 0;
 
 oom:
-       lzms_free_compressor(ctx);
+       lzms_free_compressor(c);
        return WIMLIB_ERR_NOMEM;
 }
 
 static size_t
 lzms_compress(const void *uncompressed_data, size_t uncompressed_size,
-             void *compressed_data, size_t compressed_size_avail, void *_ctx)
+             void *compressed_data, size_t compressed_size_avail, void *_c)
 {
-       struct lzms_compressor *ctx = _ctx;
-       size_t compressed_size;
-
-       LZMS_DEBUG("uncompressed_size=%zu, compressed_size_avail=%zu",
-                  uncompressed_size, compressed_size_avail);
+       struct lzms_compressor *c = _c;
 
        /* Don't bother compressing extremely small inputs.  */
-       if (uncompressed_size < 4) {
-               LZMS_DEBUG("Input too small to bother compressing.");
+       if (uncompressed_size < 4)
                return 0;
-       }
 
        /* Cap the available compressed size to a 32-bit integer and round it
         * down to the nearest multiple of 2.  */
@@ -1429,43 +1530,35 @@ lzms_compress(const void *uncompressed_data, size_t uncompressed_size,
                compressed_size_avail--;
 
        /* Initialize the compressor structures.  */
-       lzms_init_compressor(ctx, uncompressed_data, uncompressed_size,
-                            compressed_data, compressed_size_avail / 2);
+       lzms_prepare_compressor(c, uncompressed_data, uncompressed_size,
+                               compressed_data, compressed_size_avail / 2);
 
        /* Preprocess the uncompressed data.  */
-       lzms_x86_filter(ctx->window, ctx->window_size,
-                       ctx->last_target_usages, false);
+       lzms_x86_filter(c->cur_window, c->cur_window_size,
+                       c->last_target_usages, false);
+
+       /* Load the window into the match-finder.  */
+       lz_mf_load_window(c->mf, c->cur_window, c->cur_window_size);
 
        /* Compute and encode a literal/match sequence that decompresses to the
         * preprocessed data.  */
-       lzms_encode(ctx);
-
-       /* Get and return the compressed data size.  */
-       compressed_size = lzms_finalize(ctx, compressed_data,
-                                       compressed_size_avail);
-
-       if (compressed_size == 0) {
-               LZMS_DEBUG("Data did not compress to requested size or less.");
-               return 0;
-       }
-
-       LZMS_DEBUG("Compressed %zu => %zu bytes",
-                  uncompressed_size, compressed_size);
+       lzms_near_optimal_parse(c);
 
-       return compressed_size;
+       /* Return the compressed data size or 0.  */
+       return lzms_finalize(c, compressed_data, compressed_size_avail);
 }
 
 static void
-lzms_free_compressor(void *_ctx)
+lzms_free_compressor(void *_c)
 {
-       struct lzms_compressor *ctx = _ctx;
-
-       if (ctx) {
-               FREE(ctx->window);
-               lz_mf_free(ctx->mf);
-               FREE(ctx->matches);
-               FREE(ctx->optimum);
-               FREE(ctx);
+       struct lzms_compressor *c = _c;
+
+       if (c) {
+               FREE(c->cur_window);
+               lz_mf_free(c->mf);
+               FREE(c->matches);
+               FREE(c->optimum);
+               FREE(c);
        }
 }
 
diff --git a/src/lzms-decompress.c b/src/lzms-decompress.c
index 7254091449c052442bae510444f93d7f92b9a9d0..e2037d4488e93858cdb8316568a3c4f246b3d682 100644 (file)
@@ -3,7 +3,7 @@
  */
 
 /*
- * Copyright (C) 2013 Eric Biggers
+ * Copyright (C) 2013, 2014 Eric Biggers
  *
  * This file is part of wimlib, a library for working with WIM files.
  *
  * For LZ matches, up to 3 repeat offsets are allowed, similar to some other
  * LZ-based formats such as LZX and LZMA.  They must be updated in an LRU fashion,
- * except for a quirk: updates to the queue must be delayed by one LZMS item,
- * except for the removal of a repeat match.  As a result, 4 entries are
- * actually needed in the queue, even though it is only possible to decode
- * references to the first 3 at any given time.  The queue must be initialized
- * to the offsets {1, 2, 3, 4}.
+ * except for a quirk: inserting anything to the front of the queue must be
+ * delayed by one LZMS item.  The reason for this is presumably that there is
+ * almost no reason to code the same match offset twice in a row, since you
+ * might as well have coded a longer match at that offset.  For this same
+ * reason, it is also a requirement that when an offset in the queue is used,
+ * that offset is removed from the queue immediately (and made pending for
+ * front-insertion after the following decoded item), and everything to the
+ * right is shifted left one queue slot.  This creates a need for an "overflow"
+ * fourth entry in the queue, even though it is only possible to decode
+ * references to the first 3 entries at any given time.  The queue must be
+ * initialized to the offsets {1, 2, 3, 4}.
  *
  * Repeat delta matches are handled similarly, but for them there are two queues
  * updated in lock-step: one for powers and one for raw offsets.  The power
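
To make the quirk concrete, here is an editorial sketch of the delayed LRU
update; the field names follow the compressor diff above (upcoming_offset,
recent_offsets), NUM_RECENT_OFFSETS stands in for LZMS_NUM_RECENT_OFFSETS (3),
and this is not wimlib's exact code:

#include <stdint.h>

#define NUM_RECENT_OFFSETS 3

struct lz_lru_queue {
        uint32_t recent_offsets[NUM_RECENT_OFFSETS + 1];  /* 4th = overflow slot */
        uint32_t upcoming_offset;   /* pending front-insertion, 0 if none */
};

/* Run once per item: the offset used by the previous item only now
 * reaches the front of the queue. */
static void lru_advance(struct lz_lru_queue *q)
{
        if (q->upcoming_offset != 0) {
                for (int i = NUM_RECENT_OFFSETS; i > 0; i--)
                        q->recent_offsets[i] = q->recent_offsets[i - 1];
                q->recent_offsets[0] = q->upcoming_offset;
                q->upcoming_offset = 0;
        }
}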
  *    1024 symbols have been decoded with it.
  *
  *  - The LZ offset code, used for decoding the offsets of standard LZ77
- *    matches.  Each symbol represents a position slot, which corresponds to a
+ *    matches.  Each symbol represents an offset slot, which corresponds to a
  *    base value and some number of extra bits which must be read and added to
  *    the base value to reconstitute the full offset.  The number of symbols in
- *    this code is the number of position slots needed to represent all possible
+ *    this code is the number of offset slots needed to represent all possible
  *    offsets in the uncompressed block.  This code must be rebuilt whenever
  *    1024 symbols have been decoded with it.
  *
  *    symbols have been decoded with it.
  *
  *  - The delta offset code, used for decoding the offsets of delta matches.
- *    Each symbol corresponds to a position slot, which corresponds to a base
+ *    Each symbol corresponds to an offset slot, which corresponds to a base
  *    value and some number of extra bits which must be read and added to the
  *    base value to reconstitute the full offset.  The number of symbols in this
  *    code is equal to the number of symbols in the LZ offset code.  This code
@@ -508,38 +514,15 @@ lzms_range_decode_bit(struct lzms_range_decoder *dec)
        /* Load the probability entry corresponding to the current state.  */
        prob_entry = &dec->prob_entries[dec->state];
 
-       /* Treat the number of zero bits in the most recently decoded
-        * LZMS_PROBABILITY_MAX bits with this probability entry as the chance,
-        * out of LZMS_PROBABILITY_MAX, that the next bit will be a 0.  However,
-        * don't allow 0% or 100% probabilities.  */
-       prob = prob_entry->num_recent_zero_bits;
-       if (prob == LZMS_PROBABILITY_MAX)
-               prob = LZMS_PROBABILITY_MAX - 1;
-       else if (prob == 0)
-               prob = 1;
+       /* Get the probability that the next bit is 0.  */
+       prob = lzms_get_probability(prob_entry);
 
        /* Decode the next bit.  */
        bit = lzms_range_decoder_raw_decode_bit(dec->rd, prob);
 
-       /* Update the state based on the newly decoded bit.  */
+       /* Update the state and probability entry based on the decoded bit.  */
        dec->state = (((dec->state << 1) | bit) & dec->mask);
-
-       /* Update the recent bits, including the cached count of 0's.  */
-       BUILD_BUG_ON(LZMS_PROBABILITY_MAX > sizeof(prob_entry->recent_bits) * 8);
-       if (bit == 0) {
-               if (prob_entry->recent_bits & (1ULL << (LZMS_PROBABILITY_MAX - 1))) {
-                       /* Replacing 1 bit with 0 bit; increment the zero count.
-                        */
-                       prob_entry->num_recent_zero_bits++;
-               }
-       } else {
-               if (!(prob_entry->recent_bits & (1ULL << (LZMS_PROBABILITY_MAX - 1)))) {
-                       /* Replacing 0 bit with 1 bit; decrement the zero count.
-                        */
-                       prob_entry->num_recent_zero_bits--;
-               }
-       }
-       prob_entry->recent_bits = (prob_entry->recent_bits << 1) | bit;
+       lzms_update_probability_entry(prob_entry, bit);
 
        /* Return the decoded bit.  */
        return bit;
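
The helpers introduced here, lzms_get_probability() and
lzms_update_probability_entry(), factor out exactly the logic deleted above.
For reference, self-contained sketches of both (PROBABILITY_MAX stands in for
LZMS_PROBABILITY_MAX):

#include <stdint.h>

#define PROBABILITY_MAX 32  /* stands in for LZMS_PROBABILITY_MAX */

struct probability_entry {
        uint32_t num_recent_zero_bits;
        uint64_t recent_bits;
};

/* Chance, out of PROBABILITY_MAX, that the next bit is 0; 0% and 100%
 * are disallowed so both symbols remain codeable. */
static uint32_t get_probability(const struct probability_entry *e)
{
        uint32_t prob = e->num_recent_zero_bits;

        if (prob == 0)
                prob = 1;
        else if (prob == PROBABILITY_MAX)
                prob = PROBABILITY_MAX - 1;
        return prob;
}

/* Slide the window of recent bits, keeping the count of 0 bits cached. */
static void update_probability_entry(struct probability_entry *e, int bit)
{
        uint64_t top = (uint64_t)1 << (PROBABILITY_MAX - 1);

        if (bit == 0 && (e->recent_bits & top))
                e->num_recent_zero_bits++;   /* a 1 falls out, a 0 comes in */
        else if (bit != 0 && !(e->recent_bits & top))
                e->num_recent_zero_bits--;   /* a 0 falls out, a 1 comes in */
        e->recent_bits = (e->recent_bits << 1) | bit;
}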
@@ -647,8 +630,8 @@ lzms_decode_value(struct lzms_huffman_decoder *dec)
        LZMS_ASSERT(dec->slot_base_tab != NULL);
        LZMS_ASSERT(dec->extra_bits_tab != NULL);
 
-       /* Read the slot (position slot, length slot, etc.), which is encoded as
-        * Huffman symbol.  */
+       /* Read the slot (offset slot, length slot, etc.), which is encoded as a
+        * Huffman symbol.  */
        slot = lzms_huffman_decode_symbol(dec);
 
        /* Get the number of extra bits needed to represent the range of values
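
Slot-based value decoding, as described here, reduces to one table lookup plus
the extra bits.  An editorial sketch (the caller is assumed to have already read
the extra bits from the bitstream):

#include <stdint.h>

static uint32_t decode_slot_value(const uint32_t *slot_base_tab,
                                  unsigned slot, uint32_t extra_bits_value)
{
        /* full value = base of the slot + the verbatim extra bits */
        return slot_base_tab[slot] + extra_bits_value;
}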
@@ -887,7 +870,7 @@ lzms_init_decompressor(struct lzms_decompressor *ctx,
                       const void *cdata, unsigned clen,
                       void *ubuf, unsigned ulen)
 {
-       unsigned num_position_slots;
+       unsigned num_offset_slots;
 
        LZMS_DEBUG("Initializing decompressor (clen=%u, ulen=%u)", clen, ulen);
 
 
        LZMS_DEBUG("Initializing decompressor (clen=%u, ulen=%u)", clen, ulen);
 
@@ -903,11 +886,11 @@ lzms_init_decompressor(struct lzms_decompressor *ctx,
         * backwards)  */
        lzms_input_bitstream_init(&ctx->is, cdata, clen / 2);
 
-       /* Calculate the number of position slots needed for this compressed
+       /* Calculate the number of offset slots needed for this compressed
         * block.  */
-       num_position_slots = lzms_get_position_slot(ulen - 1) + 1;
+       num_offset_slots = lzms_get_offset_slot(ulen - 1) + 1;
 
-       LZMS_DEBUG("Using %u position slots", num_position_slots);
+       LZMS_DEBUG("Using %u offset slots", num_offset_slots);
 
        /* Initialize Huffman decoders for each alphabet used in the compressed
         * representation.  */
@@ -916,9 +899,9 @@ lzms_init_decompressor(struct lzms_decompressor *ctx,
                                  LZMS_LITERAL_CODE_REBUILD_FREQ);
 
        lzms_init_huffman_decoder(&ctx->lz_offset_decoder, &ctx->is,
-                                 lzms_position_slot_base,
-                                 lzms_extra_position_bits,
-                                 num_position_slots,
+                                 lzms_offset_slot_base,
+                                 lzms_extra_offset_bits,
+                                 num_offset_slots,
                                  LZMS_LZ_OFFSET_CODE_REBUILD_FREQ);
 
        lzms_init_huffman_decoder(&ctx->length_decoder, &ctx->is,
@@ -928,9 +911,9 @@ lzms_init_decompressor(struct lzms_decompressor *ctx,
                                  LZMS_LENGTH_CODE_REBUILD_FREQ);
 
        lzms_init_huffman_decoder(&ctx->delta_offset_decoder, &ctx->is,
-                                 lzms_position_slot_base,
-                                 lzms_extra_position_bits,
-                                 num_position_slots,
+                                 lzms_offset_slot_base,
+                                 lzms_extra_offset_bits,
+                                 num_offset_slots,
                                  LZMS_DELTA_OFFSET_CODE_REBUILD_FREQ);
 
        lzms_init_huffman_decoder(&ctx->delta_power_decoder, &ctx->is,
@@ -1007,7 +990,7 @@ lzms_decompress(const void *compressed_data, size_t compressed_size,
        }
 
        /* Handle the trivial case where nothing needs to be decompressed.
-        * (Necessary because a window of size 0 does not have a valid position
+        * (Necessary because a window of size 0 does not have a valid offset
         * slot.)  */
        if (uncompressed_size == 0)
                return 0;
@@ -1039,8 +1022,8 @@ lzms_create_decompressor(size_t max_block_size, void **ctx_ret)
        struct lzms_decompressor *ctx;
 
        /* The x86 post-processor requires that the uncompressed length fit into
-        * a signed 32-bit integer.  Also, the position slot table cannot be
-        * searched for a position of INT32_MAX or greater.  */
+        * a signed 32-bit integer.  Also, the offset slot table cannot be
+        * searched for an offset of INT32_MAX or greater.  */
        if (max_block_size >= INT32_MAX)
                return WIMLIB_ERR_INVALID_PARAM;
 
@@ -1049,7 +1032,7 @@ lzms_create_decompressor(size_t max_block_size, void **ctx_ret)
        if (ctx == NULL)
                return WIMLIB_ERR_NOMEM;
 
-       /* Initialize position and length slot data if not done already.  */
+       /* Initialize offset and length slot data if not done already.  */
        lzms_init_slots();
 
        *ctx_ret = ctx;
diff --git a/src/lzx-common.c b/src/lzx-common.c
index 566161c3585abf4130ed265a270aa7e09c8cba8d..827b1b29657d436d0472fae98487b6d2d7f50f5c 100644 (file)
@@ -33,9 +33,9 @@
 #  include <emmintrin.h>
 #endif
 
-/* Mapping: position slot => first match offset that uses that position slot.
+/* Mapping: offset slot => first match offset that uses that offset slot.
  */
-const u32 lzx_position_base[LZX_MAX_POSITION_SLOTS] = {
+const u32 lzx_offset_slot_base[LZX_MAX_OFFSET_SLOTS] = {
        0      , 1      , 2      , 3      , 4      ,    /* 0  --- 4  */
        6      , 8      , 12     , 16     , 24     ,    /* 5  --- 9  */
        32     , 48     , 64     , 96     , 128    ,    /* 10 --- 14 */
@@ -49,10 +49,9 @@ const u32 lzx_position_base[LZX_MAX_POSITION_SLOTS] = {
        2097152                                         /* 50        */
 };
 
-/* Mapping: position slot => how many extra bits must be read and added to the
- * corresponding position base to decode the match offset.  */
-#ifdef USE_LZX_EXTRA_BITS_ARRAY
-const u8 lzx_extra_bits[LZX_MAX_POSITION_SLOTS] = {
+/* Mapping: offset slot => how many extra bits must be read and added to the
+ * corresponding offset slot base to decode the match offset.  */
+const u8 lzx_extra_offset_bits[LZX_MAX_OFFSET_SLOTS] = {
        0 , 0 , 0 , 0 , 1 ,
        1 , 2 , 2 , 3 , 3 ,
        4 , 4 , 5 , 5 , 6 ,
@@ -65,7 +64,6 @@ const u8 lzx_extra_bits[LZX_MAX_POSITION_SLOTS] = {
        17, 17, 17, 17, 17,
        17
 };
-#endif
 
 /* Round the specified compression block size (not LZX block size) up to the
  * next valid LZX window size, and return its order (log2).  Or, if the block
@@ -96,31 +94,31 @@ lzx_get_num_main_syms(unsigned window_order)
        /* NOTE: the calculation *should* be as follows:
         *
         * u32 max_offset = window_size - LZX_MIN_MATCH_LEN;
-        * u32 max_formatted_offset = max_offset + LZX_OFFSET_OFFSET;
-        * u32 num_position_slots = 1 + lzx_get_position_slot_raw(max_formatted_offset);
+        * u32 max_adjusted_offset = max_offset + LZX_OFFSET_OFFSET;
+        * u32 num_offset_slots = 1 + lzx_get_offset_slot_raw(max_adjusted_offset);
         *
         * However since LZX_MIN_MATCH_LEN == LZX_OFFSET_OFFSET, we would get
-        * max_formatted_offset == window_size, which would bump the number of
-        * position slots up by 1 since every valid LZX window size is equal to
-        * a position base value.  The format doesn't do this, and instead
+        * max_adjusted_offset == window_size, which would bump the number of
+        * offset slots up by 1 since every valid LZX window size is equal to an
+        * offset slot base value.  The format doesn't do this, and instead
         * disallows matches with minimum length and maximum offset.  This sets
-        * max_formatted_offset = window_size - 1, so instead we must calculate:
+        * max_adjusted_offset = window_size - 1, so instead we must calculate:
         *
-        * num_position_slots = 1 + lzx_get_position_slot_raw(window_size - 1);
+        * num_offset_slots = 1 + lzx_get_offset_slot_raw(window_size - 1);
         *
         * ... which is the same as
         *
-        * num_position_slots = lzx_get_position_slot_raw(window_size);
+        * num_offset_slots = lzx_get_offset_slot_raw(window_size);
         *
-        * ... since every valid window size is equal to a position base value.
+        * ... since every valid window size is equal to an offset base value.
         */
-       unsigned num_position_slots = lzx_get_position_slot_raw(window_size);
+       unsigned num_offset_slots = lzx_get_offset_slot_raw(window_size);
 
        /* Now calculate the number of main symbols as LZX_NUM_CHARS literal
-        * symbols, plus 8 symbols per position slot (since there are 8 possible
-        * length headers, and we need all (position slot, length header)
+        * symbols, plus 8 symbols per offset slot (since there are 8 possible
+        * length headers, and we need all (offset slot, length header)
         * combinations).  */
-       return LZX_NUM_CHARS + (num_position_slots << 3);
+       return LZX_NUM_CHARS + (num_offset_slots << 3);
 }
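
A worked example of the calculation above (editorial note): for window_order 16,
window_size is 65536, which is itself an offset slot base (slot 32 by the table
earlier in this file), so:

        unsigned num_offset_slots = 32;               /* lzx_get_offset_slot_raw(65536) */
        unsigned num_main_syms    = 256 + (32 << 3);  /* LZX_NUM_CHARS + 8 per slot = 512 */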
 
 static void
diff --git a/src/lzx-compress.c b/src/lzx-compress.c
index bc50a858bc9b62cf26d9d1b03d7b822591a1eb33..a9745b62c8a74973114e1088a358468d1123d8cc 100644 (file)
 
 
 /*
- * This file contains a compressor for the LZX ("Lempel-Ziv eXtended"?)
- * compression format, as used in the WIM (Windows IMaging) file format.  This
- * code may need some slight modifications to be used outside of the WIM format.
- * In particular, in other situations the LZX block header might be slightly
- * different, and a sliding window rather than a fixed-size window might be
- * required.
+ * This file contains a compressor for the LZX ("Lempel-Ziv eXtended")
+ * compression format, as used in the WIM (Windows IMaging) file format.
  *
- * ----------------------------------------------------------------------------
+ * Two different parsing algorithms are implemented: "near-optimal" and "lazy".
+ * "Near-optimal" is significantly slower than "lazy", but results in a better
+ * compression ratio.  The "near-optimal" algorithm is used at the default
+ * compression level.
  *
- *                              Format Overview
+ * This file may need some slight modifications to be used outside of the WIM
+ * format.  In particular, in other situations the LZX block header might be
+ * slightly different, and a sliding window rather than a fixed-size window
+ * might be required.
  *
- * The primary reference for LZX is the specification released by Microsoft.
- * However, the comments in lzx-decompress.c provide more information about LZX
- * and note some errors in the Microsoft specification.
- *
- * LZX shares many similarities with DEFLATE, the format used by zlib and gzip.
- * Both LZX and DEFLATE use LZ77 matching and Huffman coding.  Certain details
- * are quite similar, such as the method for storing Huffman codes.  However,
- * the main differences are:
+ * Note: LZX is a compression format derived from DEFLATE, the format used by
+ * zlib and gzip.  Both LZX and DEFLATE use LZ77 matching and Huffman coding.
+ * Certain details are quite similar, such as the method for storing Huffman
+ * codes.  However, the main differences are:
  *
  * - LZX preprocesses the data to attempt to make x86 machine code slightly more
  *   compressible before attempting to compress it further.
  *
  * - LZX uses a "main" alphabet which combines literals and matches, with the
  *   match symbols containing a "length header" (giving all or part of the match
- *   length) and a "position slot" (giving, roughly speaking, the order of
+ *   length) and an "offset slot" (giving, roughly speaking, the order of
  *   magnitude of the match offset).
  *
  * - LZX does not have static Huffman blocks (that is, the kind with preset
  *   Huffman codes); however it does have two types of dynamic Huffman blocks
  *   ("verbatim" and "aligned").
  *
- * - LZX has a minimum match length of 2 rather than 3.
- *
- * - In LZX, match offsets 0 through 2 actually represent entries in an LRU
- *   queue of match offsets.  This is very useful for certain types of files,
- *   such as binary files that have repeating records.
- *
- * ----------------------------------------------------------------------------
- *
- *                           Algorithmic Overview
- *
- * At a high level, any implementation of LZX compression must operate as
- * follows:
- *
- * 1. Preprocess the input data to translate the targets of 32-bit x86 call
- *    instructions to absolute offsets.  (Actually, this is required for WIM,
- *    but might not be in other places LZX is used.)
- *
- * 2. Find a sequence of LZ77-style matches and literal bytes that expands to
- *    the preprocessed data.
- *
- * 3. Divide the match/literal sequence into one or more LZX blocks, each of
- *    which may be "uncompressed", "verbatim", or "aligned".
- *
- * 4. Output each LZX block.
- *
- * Step (1) is fairly straightforward.  It requires looking for 0xe8 bytes in
- * the input data and performing a translation on the 4 bytes following each
- * one.
- *
- * Step (4) is complicated, but it is mostly determined by the LZX format.  The
- * only real choice we have is what algorithm to use to build the length-limited
- * canonical Huffman codes.  See lzx_write_all_blocks() for details.
- *
- * That leaves steps (2) and (3) as where all the hard stuff happens.  Focusing
- * on step (2), we need to do LZ77-style parsing on the input data, or "window",
- * to divide it into a sequence of matches and literals.  Each position in the
- * window might have multiple matches associated with it, and we need to choose
- * which one, if any, to actually use.  Therefore, the problem can really be
- * divided into two areas of concern: (a) finding matches at a given position,
- * which we shall call "match-finding", and (b) choosing whether to use a
- * match or a literal at a given position, and if using a match, which one (if
- * there is more than one available).  We shall call this "match-choosing".  We
- * first consider match-finding, then match-choosing.
- *
- * ----------------------------------------------------------------------------
- *
- *                              Match-finding
- *
- * Given a position in the window, we want to find LZ77-style "matches" with
- * that position at previous positions in the window.  With LZX, the minimum
- * match length is 2 and the maximum match length is 257.  The only restriction
- * on offsets is that LZX does not allow the last 2 bytes of the window to match
- * the beginning of the window.
- *
- * There are a number of algorithms that can be used for this, including hash
- * chains, binary trees, and suffix arrays.  Binary trees generally work well
- * for LZX compression since it uses medium-size windows (2^15 to 2^21 bytes).
- * However, when compressing in a fast mode where many positions are skipped
- * (not searched for matches), hash chains are faster.
+ * - LZX has a minimum match length of 2 rather than 3.  Length 2 matches can be
+ *   useful, but generally only if the parser is smart about choosing them.
  *
- * Since the match-finders are not specific to LZX, I will not explain them in
- * detail here.  Instead, see lz_hash_chains.c and lz_binary_trees.c.
- *
- * ----------------------------------------------------------------------------
- *
- *                              Match-choosing
- *
- * Usually, choosing the longest match is best because it encodes the most data
- * in that one item.  However, sometimes the longest match is not optimal
- * because (a) choosing a long match now might prevent using an even longer
- * match later, or (b) more generally, what we actually care about is the number
- * of bits it will ultimately take to output each match or literal, which is
- * actually dependent on the entropy encoding using by the underlying
- * compression format.  Consequently, a longer match usually, but not always,
- * takes fewer bits to encode than multiple shorter matches or literals that
- * cover the same data.
- *
- * This problem of choosing the truly best match/literal sequence is probably
- * impossible to solve efficiently when combined with entropy encoding.  If we
- * knew how many bits it takes to output each match/literal, then we could
- * choose the optimal sequence using shortest-path search a la Dijkstra's
- * algorithm.  However, with entropy encoding, the chosen match/literal sequence
- * affects its own encoding.  Therefore, we can't know how many bits it will
- * take to actually output any one match or literal until we have actually
- * chosen the full sequence of matches and literals.
- *
- * Notwithstanding the entropy encoding problem, we also aren't guaranteed to
- * choose the optimal match/literal sequence unless the match-finder (see
- * section "Match-finder") provides the match-chooser with all possible matches
- * at each position.  However, this is not computationally efficient.  For
- * example, there might be many matches of the same length, and usually (but not
- * always) the best choice is the one with the smallest offset.  So in practice,
- * it's fine to only consider the smallest offset for a given match length at a
- * given position.  (Actually, for LZX, it's also worth considering repeat
- * offsets.)
- *
- * In addition, as mentioned earlier, in LZX we have the choice of using
- * multiple blocks, each of which resets the Huffman codes.  This expands the
- * search space even further.  Therefore, to simplify the problem, we currently
- * we don't attempt to actually choose the LZX blocks based on the data.
- * Instead, we just divide the data into fixed-size blocks of LZX_DIV_BLOCK_SIZE
- * bytes each, and always use verbatim or aligned blocks (never uncompressed).
- * A previous version of this code recursively split the input data into
- * equal-sized blocks, up to a maximum depth, and chose the lowest-cost block
- * divisions.  However, this made compression much slower and did not actually
- * help very much.  It remains an open question whether a sufficiently fast and
- * useful block-splitting algorithm is possible for LZX.  Essentially the same
- * problem also applies to DEFLATE.  The Microsoft LZX compressor seemingly does
- * do block splitting, although I don't know how fast or useful it is,
- * specifically.
- *
- * Now, back to the entropy encoding problem.  The "solution" is to use an
- * iterative approach to compute a good, but not necessarily optimal,
- * match/literal sequence.  Start with a fixed assignment of symbol costs and
- * choose an "optimal" match/literal sequence based on those costs, using
- * shortest-path seach a la Dijkstra's algorithm.  Then, for each iteration of
- * the optimization, update the costs based on the entropy encoding of the
- * current match/literal sequence, then choose a new match/literal sequence
- * based on the updated costs.  Usually, the actual cost to output the current
- * match/literal sequence will decrease in each iteration until it converges on
- * a fixed point.  This result may not be the truly optimal match/literal
- * sequence, but it usually is much better than one chosen by doing a "greedy"
- * parse where we always chooe the longest match.
- *
- * An alternative to both greedy parsing and iterative, near-optimal parsing is
- * "lazy" parsing.  Briefly, "lazy" parsing considers just the longest match at
- * each position, but it waits to choose that match until it has also examined
- * the next position.  This is actually a useful approach; it's used by zlib,
- * for example.  Therefore, for fast compression we combine lazy parsing with
- * the hash chain max-finder.  For normal/high compression we combine
- * near-optimal parsing with the binary tree match-finder.
+ * - In LZX, offset slots 0 through 2 actually represent entries in an LRU queue
+ *   of match offsets.  This is very useful for certain types of files, such as
+ *   binary files that have repeating records.
  */
 
 #ifdef HAVE_CONFIG_H
 #  include "config.h"
 #endif
 
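
The x86 preprocessing mentioned in the comment above translates the 32-bit
relative target that follows each 0xe8 call opcode into an absolute offset.  An
editorial sketch of the idea, ignoring the bounds and range rules of the real
filter and assuming a little-endian host:

#include <stdint.h>
#include <string.h>

static void e8_translate_sketch(unsigned char *data, int32_t size)
{
        for (int32_t i = 0; i + 5 <= size; i++) {
                if (data[i] == 0xe8) {
                        int32_t rel;
                        memcpy(&rel, &data[i + 1], 4);  /* little-endian host assumed */
                        rel += i;                       /* relative -> absolute */
                        memcpy(&data[i + 1], &rel, 4);
                        i += 4;                         /* skip the translated operand */
                }
        }
}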
-#include "wimlib/compressor_ops.h"
 #include "wimlib/compress_common.h"
 #include "wimlib/compress_common.h"
+#include "wimlib/compressor_ops.h"
 #include "wimlib/endianness.h"
 #include "wimlib/error.h"
 #include "wimlib/lz_mf.h"
 #include "wimlib/lz_repsearch.h"
 #include "wimlib/lzx.h"
 #include "wimlib/util.h"
 #include "wimlib/endianness.h"
 #include "wimlib/error.h"
 #include "wimlib/lz_mf.h"
 #include "wimlib/lz_repsearch.h"
 #include "wimlib/lzx.h"
 #include "wimlib/util.h"
+
 #include <string.h>
+#include <limits.h>
 
 #define LZX_OPTIM_ARRAY_LENGTH 4096
 
 
 #define LZX_OPTIM_ARRAY_LENGTH 4096
 
 
 #define LZX_CACHE_LEN (LZX_DIV_BLOCK_SIZE * (LZX_CACHE_PER_POS + 1))
 
 
 #define LZX_CACHE_LEN (LZX_DIV_BLOCK_SIZE * (LZX_CACHE_PER_POS + 1))
 
-/* Codewords for the LZX main, length, and aligned offset Huffman codes  */
+struct lzx_compressor;
+
+/* Codewords for the LZX Huffman codes.  */
 struct lzx_codewords {
        u32 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
        u32 len[LZX_LENCODE_NUM_SYMBOLS];
        u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
 };
 
 struct lzx_codewords {
        u32 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
        u32 len[LZX_LENCODE_NUM_SYMBOLS];
        u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
 };
 
-/* Codeword lengths (in bits) for the LZX main, length, and aligned offset
- * Huffman codes.
- *
- * A 0 length means the codeword has zero frequency.
- */
+/* Codeword lengths (in bits) for the LZX Huffman codes.
+ * A zero length means the corresponding codeword has zero frequency.  */
 struct lzx_lens {
        u8 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
        u8 len[LZX_LENCODE_NUM_SYMBOLS];
        u8 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
 };
 
 struct lzx_lens {
        u8 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
        u8 len[LZX_LENCODE_NUM_SYMBOLS];
        u8 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
 };
 
-/* Costs for the LZX main, length, and aligned offset Huffman symbols.
- *
- * If a codeword has zero frequency, it must still be assigned some nonzero cost
- * --- generally a high cost, since even if it gets used in the next iteration,
- * it probably will not be used very many times.  */
+/* Estimated cost, in bits, to output each symbol in the LZX Huffman codes.  */
 struct lzx_costs {
        u8 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
        u8 len[LZX_LENCODE_NUM_SYMBOLS];
        u8 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
 };
 
 struct lzx_costs {
        u8 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
        u8 len[LZX_LENCODE_NUM_SYMBOLS];
        u8 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
 };
 
-/* The LZX main, length, and aligned offset Huffman codes  */
+/* Codewords and lengths for the LZX Huffman codes.  */
 struct lzx_codes {
        struct lzx_codewords codewords;
        struct lzx_lens lens;
 };
 
 struct lzx_codes {
        struct lzx_codewords codewords;
        struct lzx_lens lens;
 };
 
-/* Tables for tallying symbol frequencies in the three LZX alphabets  */
+/* Symbol frequency counters for the LZX Huffman codes.  */
 struct lzx_freqs {
        u32 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
        u32 len[LZX_LENCODE_NUM_SYMBOLS];
        u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
 };
 
 struct lzx_freqs {
        u32 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
        u32 len[LZX_LENCODE_NUM_SYMBOLS];
        u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
 };
 
-/* LZX intermediate match/literal format  */
+/* Intermediate LZX match/literal format  */
 struct lzx_item {
 struct lzx_item {
-       /* Bit     Description
-        *
-        * 31      1 if a match, 0 if a literal.
-        *
-        * 30-25   position slot.  This can be at most 50, so it will fit in 6
-        *         bits.
-        *
-        * 8-24    position footer.  This is the offset of the real formatted
-        *         offset from the position base.  This can be at most 17 bits
-        *         (since lzx_extra_bits[LZX_MAX_POSITION_SLOTS - 1] is 17).
-        *
-        * 0-7     length of match, minus 2.  This can be at most
-        *         (LZX_MAX_MATCH_LEN - 2) == 255, so it will fit in 8 bits.  */
-       u32 data;
-};
-
-/* Specification for an LZX block.  */
-struct lzx_block_spec {
-
-       /* One of the LZX_BLOCKTYPE_* constants indicating which type of this
-        * block.  */
-       int block_type;
 
 
-       /* 0-based position in the window at which this block starts.  */
-       u32 window_pos;
-
-       /* The number of bytes of uncompressed data this block represents.  */
-       u32 block_size;
-
-       /* The match/literal sequence for this block.  */
-       struct lzx_item *chosen_items;
-
-       /* The length of the @chosen_items sequence.  */
-       u32 num_chosen_items;
-
-       /* Huffman codes for this block.  */
-       struct lzx_codes codes;
+       /* Bits 0  -  9: Main symbol
+        * Bits 10 - 17: Length symbol
+        * Bits 18 - 22: Number of extra offset bits
+        * Bits 23+    : Extra offset bits  */
+       u64 data;
 };
 
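As an aside, the new 64-bit item format packs with plain shifts and masks.  A minimal sketch, not part of the patch (the helper name is hypothetical; the layout follows the bitfield comment above and matches the decoding in lzx_write_item() below):

	static inline u64
	sketch_pack_lzx_item(unsigned main_symbol, unsigned len_symbol,
			     unsigned num_extra_bits, u32 extra_bits)
	{
		return (u64)main_symbol |            /* bits 0 - 9   */
		       ((u64)len_symbol << 10) |     /* bits 10 - 17 */
		       ((u64)num_extra_bits << 18) | /* bits 18 - 22 */
		       ((u64)extra_bits << 23);      /* bits 23+     */
	}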
-struct lzx_compressor;
-
+/* Internal compression parameters  */
 struct lzx_compressor_params {
-       struct lz_match (*choose_item_func)(struct lzx_compressor *);
-       enum lz_mf_algo mf_algo;
+       u32 (*choose_items_for_block)(struct lzx_compressor *, u32, u32);
        u32 num_optim_passes;
+       enum lz_mf_algo mf_algo;
        u32 min_match_length;
        u32 nice_match_length;
        u32 max_search_depth;
 };
 
-/* State of the LZX compressor.  */
-struct lzx_compressor {
+/*
+ * Match chooser position data:
+ *
+ * An array of these structures is used during the near-optimal match-choosing
+ * algorithm.  They correspond to consecutive positions in the window and are
+ * used to keep track of the cost to reach each position, and the
+ * match/literal choices that must be made to reach that position.
+ */
+struct lzx_mc_pos_data {
 
-       /* The buffer of data to be compressed.
+       /* The cost, in bits, of the lowest-cost path that has been found to
+        * reach this position.  This can change as progressively lower cost
+        * paths are found to reach this position.  */
+       u32 cost;
+#define MC_INFINITE_COST UINT32_MAX
+
+       /* The match or literal that was taken to reach this position.  This can
+        * change as progressively lower cost paths are found to reach this
+        * position.
+        *
+        * This variable is divided into two bitfields.
         *
-        * 0xe8 byte preprocessing is done directly on the data here before
-        * further compression.
+        * Literals:
+        *      Low bits are 1, high bits are the literal.
+        *
+        * Explicit offset matches:
+        *      Low bits are the match length, high bits are the offset plus 2.
+        *
+        * Repeat offset matches:
+        *      Low bits are the match length, high bits are the queue index.
+        */
+       u32 mc_item_data;
+#define MC_OFFSET_SHIFT 9
+#define MC_LEN_MASK ((1 << MC_OFFSET_SHIFT) - 1)
+
+       /* The state of the LZX recent match offsets queue at this position.
+        * This is filled in lazily, only after the minimum-cost path to this
+        * position is found.
         *
-        * Note that this compressor does *not* use a real sliding window!!!!
-        * It's not needed in the WIM format, since every chunk is compressed
-        * independently.  This is by design, to allow random access to the
-        * chunks.  */
+        * Note: the way we handle this adaptive state in the "minimum-cost"
+        * parse is actually only an approximation.  It's possible for the
+        * globally optimal, minimum-cost path to contain a prefix that ends at
+        * some position but is *not* the minimum-cost path to that position.
+        * This can happen if such a path prefix results in a
+        * different adaptive state which results in lower costs later.  We do
+        * not solve this problem; we only consider the lowest cost to reach
+        * each position, which seems to be an acceptable approximation.  */
+       struct lzx_lru_queue queue _aligned_attribute(16);
+
+} _aligned_attribute(16);
+
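A minimal sketch, not part of the patch, of how these bitfields are interpreted (it mirrors lzx_declare_item() below; per the comment above, explicit offsets are stored biased by LZX_OFFSET_OFFSET, i.e. plus 2):

	static const char *
	sketch_classify_mc_item(u32 mc_item_data)
	{
		u32 len = mc_item_data & MC_LEN_MASK;
		u32 high = mc_item_data >> MC_OFFSET_SHIFT;

		if (len == 1)
			return "literal";              /* 'high' is the literal byte */
		if (high < LZX_NUM_RECENT_OFFSETS)
			return "repeat offset match";  /* 'high' is the queue index  */
		return "explicit offset match";        /* offset = high - LZX_OFFSET_OFFSET */
	}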
+/* State of the LZX compressor  */
+struct lzx_compressor {
+
+       /* Internal compression parameters  */
+       struct lzx_compressor_params params;
+
+       /* The preprocessed buffer of data being compressed  */
        u8 *cur_window;
 
        /* Number of bytes of data to be compressed, which is the number of
         * bytes of data in @cur_window that are actually valid.  */
        u32 cur_window_size;
 
-       /* Allocated size of @cur_window.  */
-       u32 max_window_size;
-
        /* log2 order of the LZX window size for LZ match offset encoding
         * purposes.  Will be >= LZX_MIN_WINDOW_ORDER and <=
         * LZX_MAX_WINDOW_ORDER.
         *
-        * Note: 1 << @window_order is normally equal to @max_window_size, but
-        * it will be greater than @max_window_size in the event that the
-        * compressor was created with a non-power-of-2 block size.  (See
-        * lzx_get_window_order().)  */
+        * Note: 1 << @window_order is normally equal to @max_window_size,
+        * a.k.a. the allocated size of @cur_window, but it will be greater than
+        * @max_window_size in the event that the compressor was created with a
+        * non-power-of-2 block size.  (See lzx_get_window_order().)  */
        unsigned window_order;
 
-       /* Compression parameters.  */
-       struct lzx_compressor_params params;
+       /* Number of symbols in the main alphabet.  This depends on
+        * @window_order, since @window_order determines the maximum possible
+        * offset.  It does not, however, depend on the *actual* size of the
+        * current data buffer being processed, which might be less than 1 <<
+        * @window_order.  */
+       unsigned num_main_syms;
 
+       /* Lempel-Ziv match-finder  */
+       struct lz_mf *mf;
+
+       /* Match-finder wrapper functions and data for near-optimal parsing.
+        *
+        * When doing more than one match-choosing pass over the data, matches
+        * found by the match-finder are cached to achieve a slight speedup when
+        * the same matches are needed on subsequent passes.  This is suboptimal
+        * because different matches may be preferred with different cost
+        * models, but it is a very worthwhile speedup.  */
        unsigned (*get_matches_func)(struct lzx_compressor *, const struct lz_match **);
        void (*skip_bytes_func)(struct lzx_compressor *, unsigned n);
+       u32 match_window_pos;
+       u32 match_window_end;
+       struct lz_match *cached_matches;
+       struct lz_match *cache_ptr;
+       struct lz_match *cache_limit;
 
-       /* Number of symbols in the main alphabet (depends on the @window_order
-        * since it determines the maximum allowed offset).  */
-       unsigned num_main_syms;
+       /* Position data for near-optimal parsing.  */
+       struct lzx_mc_pos_data optimum[LZX_OPTIM_ARRAY_LENGTH + LZX_MAX_MATCH_LEN];
+
+       /* The cost model currently being used for near-optimal parsing.  */
+       struct lzx_costs costs;
 
        /* The current match offset LRU queue.  */
        struct lzx_lru_queue queue;
 
-       /* Space for the sequences of matches/literals that were chosen for each
-        * block.  */
-       struct lzx_item *chosen_items;
-
-       /* Information about the LZX blocks the preprocessed input was divided
-        * into.  */
-       struct lzx_block_spec *block_specs;
-
-       /* Number of LZX blocks the input was divided into; a.k.a. the number of
-        * elements of @block_specs that are valid.  */
-       unsigned num_blocks;
-
-       /* This is simply filled in with zeroes and used to avoid special-casing
-        * the output of the first compressed Huffman code, which conceptually
-        * has a delta taken from a code with all symbols having zero-length
-        * codewords.  */
-       struct lzx_codes zero_codes;
-
-       /* The current cost model.  */
-       struct lzx_costs costs;
-
-       /* Lempel-Ziv match-finder.  */
-       struct lz_mf *mf;
+       /* Frequency counters for the current block.  */
+       struct lzx_freqs freqs;
 
-       /* Position in window of next match to return.  */
-       u32 match_window_pos;
+       /* The Huffman codes for the current and previous blocks.  */
+       struct lzx_codes codes[2];
 
-       /* The end-of-block position.  We can't allow any matches to span this
-        * position.  */
-       u32 match_window_end;
+       /* Which 'struct lzx_codes' is being used for the current block.  The
+        * other was used for the previous block (if this isn't the first
+        * block).  */
+       unsigned int codes_index;
 
-       /* When doing more than one match-choosing pass over the data, matches
-        * found by the match-finder are cached in the following array to
-        * achieve a slight speedup when the same matches are needed on
-        * subsequent passes.  This is suboptimal because different matches may
-        * be preferred with different cost models, but seems to be a worthwhile
-        * speedup.  */
-       struct lz_match *cached_matches;
-       struct lz_match *cache_ptr;
-       struct lz_match *cache_limit;
+       /* Dummy lengths that are always 0.  */
+       struct lzx_lens zero_lens;
 
-       /* Match-chooser state, used when doing near-optimal parsing.
-        *
-        * When matches have been chosen, optimum_cur_idx is set to the position
-        * in the window of the next match/literal to return and optimum_end_idx
-        * is set to the position in the window at the end of the last
-        * match/literal to return.  */
-       struct lzx_mc_pos_data *optimum;
-       unsigned optimum_cur_idx;
-       unsigned optimum_end_idx;
-
-       /* Previous match, used when doing lazy parsing.  */
-       struct lz_match prev_match;
-};
+       /* Matches/literals that were chosen for the current block.  */
+       struct lzx_item chosen_items[LZX_DIV_BLOCK_SIZE];
 
-/*
- * Match chooser position data:
- *
- * An array of these structures is used during the match-choosing algorithm.
- * They correspond to consecutive positions in the window and are used to keep
- * track of the cost to reach each position, and the match/literal choices that
- * need to be chosen to reach that position.
- */
-struct lzx_mc_pos_data {
-       /* The approximate minimum cost, in bits, to reach this position in the
-        * window which has been found so far.  */
-       u32 cost;
-#define MC_INFINITE_COST ((u32)~0UL)
-
-       /* The union here is just for clarity, since the fields are used in two
-        * slightly different ways.  Initially, the @prev structure is filled in
-        * first, and links go from later in the window to earlier in the
-        * window.  Later, @next structure is filled in and links go from
-        * earlier in the window to later in the window.  */
-       union {
-               struct {
-                       /* Position of the start of the match or literal that
-                        * was taken to get to this position in the approximate
-                        * minimum-cost parse.  */
-                       u32 link;
-
-                       /* Offset (as in an LZ (length, offset) pair) of the
-                        * match or literal that was taken to get to this
-                        * position in the approximate minimum-cost parse.  */
-                       u32 match_offset;
-               } prev;
-               struct {
-                       /* Position at which the match or literal starting at
-                        * this position ends in the minimum-cost parse.  */
-                       u32 link;
-
-                       /* Offset (as in an LZ (length, offset) pair) of the
-                        * match or literal starting at this position in the
-                        * approximate minimum-cost parse.  */
-                       u32 match_offset;
-               } next;
-       };
-
-       /* Adaptive state that exists after an approximate minimum-cost path to
-        * reach this position is taken.
-        *
-        * Note: we update this whenever we update the pending minimum-cost
-        * path.  This is in contrast to LZMA, which also has an optimal parser
-        * that maintains a repeat offset queue per position, but will only
-        * compute the queue once that position is actually reached in the
-        * parse, meaning that matches are being considered *starting* at that
-        * position.  However, the two methods seem to have approximately the
-        * same performance if appropriate optimizations are used.  Intuitively
-        * the LZMA method seems faster, but it actually suffers from 1-2 extra
-        * hard-to-predict branches at each position.  Probably it works better
-        * for LZMA than LZX because LZMA has a larger adaptive state than LZX,
-        * and the LZMA encoder considers more possibilities.  */
-       struct lzx_lru_queue queue;
+       /* Table mapping match offset => offset slot for small offsets  */
+#define LZX_NUM_FAST_OFFSETS 32768
+       u8 offset_slot_fast[LZX_NUM_FAST_OFFSETS];
 };
 
-
 /*
  * Structure to keep track of the current state of sending bits to the
  * compressed output buffer.
@@ -519,7 +327,7 @@ lzx_init_output(struct lzx_output_bitstream *os, void *buffer, u32 size)
  * The bits are given by the low-order @num_bits bits of @bits.  Higher-order
  * bits in @bits cannot be set.  At most 17 bits can be written at once.
  *
- * @max_bits is a compile-time constant that specifies the maximum number of
+ * @max_num_bits is a compile-time constant that specifies the maximum number of
  * bits that can ever be written at the call site.  Currently, it is used to
  * optimize away the conditional code for writing a second 16-bit coding unit
  * when writing fewer than 17 bits.
@@ -527,7 +335,7 @@ lzx_init_output(struct lzx_output_bitstream *os, void *buffer, u32 size)
  * If the output buffer space is exhausted, then the bits will be ignored, and
  * lzx_flush_output() will return 0 when it gets called.
  */
-static _always_inline_attribute void
+static inline void
 lzx_write_varbits(struct lzx_output_bitstream *os,
                  const u32 bits, const unsigned int num_bits,
                  const unsigned int max_num_bits)
@@ -567,7 +375,7 @@ lzx_write_varbits(struct lzx_output_bitstream *os,
 
 /* Use when @num_bits is a compile-time constant.  Otherwise use
  * lzx_write_varbits().  */
-static _always_inline_attribute void
+static inline void
 lzx_write_bits(struct lzx_output_bitstream *os,
               const u32 bits, const unsigned int num_bits)
 {
@@ -590,49 +398,12 @@ lzx_flush_output(struct lzx_output_bitstream *os)
        return (const u8 *)os->next - (const u8 *)os->start;
 }
 
-/* Returns the LZX position slot that corresponds to a given match offset,
- * taking into account the recent offset queue and updating it if the offset is
- * found in it.  */
-static unsigned
-lzx_get_position_slot(u32 offset, struct lzx_lru_queue *queue)
-{
-       unsigned position_slot;
-
-       /* See if the offset was recently used.  */
-       for (int i = 0; i < LZX_NUM_RECENT_OFFSETS; i++) {
-               if (offset == queue->R[i]) {
-                       /* Found it.  */
-
-                       /* Bring the repeat offset to the front of the
-                        * queue.  Note: this is, in fact, not a real
-                        * LRU queue because repeat matches are simply
-                        * swapped to the front.  */
-                       swap(queue->R[0], queue->R[i]);
-
-                       /* The resulting position slot is simply the first index
-                        * at which the offset was found in the queue.  */
-                       return i;
-               }
-       }
-
-       /* The offset was not recently used; look up its real position slot.  */
-       position_slot = lzx_get_position_slot_raw(offset + LZX_OFFSET_OFFSET);
-
-       /* Bring the new offset to the front of the queue.  */
-       for (int i = LZX_NUM_RECENT_OFFSETS - 1; i > 0; i--)
-               queue->R[i] = queue->R[i - 1];
-       queue->R[0] = offset;
-
-       return position_slot;
-}
-
 /* Build the main, length, and aligned offset Huffman codes used in LZX.
  *
  * This takes as input the frequency tables for each code and produces as output
  * a set of tables that map symbols to codewords and codeword lengths.  */
 static void
-lzx_make_huffman_codes(const struct lzx_freqs *freqs,
-                      struct lzx_codes *codes,
+lzx_make_huffman_codes(const struct lzx_freqs *freqs, struct lzx_codes *codes,
                       unsigned num_main_syms)
 {
        make_canonical_huffman_code(num_main_syms,
@@ -654,100 +425,6 @@ lzx_make_huffman_codes(const struct lzx_freqs *freqs,
                                    codes->codewords.aligned);
 }
 
-/*
- * Output a precomputed LZX match.
- *
- * @os:
- *     The bitstream to which to write the match.
- * @ones_if_aligned
- *     A mask of all ones if the block is of type LZX_BLOCKTYPE_ALIGNED,
- *     otherwise 0.
- * @match:
- *     The match data.
- * @codes:
- *     Pointer to a structure that contains the codewords for the main, length,
- *     and aligned offset Huffman codes for the current LZX compressed block.
- */
-static void
-lzx_write_match(struct lzx_output_bitstream *os, unsigned ones_if_aligned,
-               struct lzx_item match, const struct lzx_codes *codes)
-{
-       unsigned match_len_minus_2 = match.data & 0xff;
-       u32 position_footer = (match.data >> 8) & 0x1ffff;
-       unsigned position_slot = (match.data >> 25) & 0x3f;
-       unsigned len_header;
-       unsigned len_footer;
-       unsigned main_symbol;
-       unsigned num_extra_bits;
-
-       /* If the match length is less than MIN_MATCH_LEN (= 2) +
-        * NUM_PRIMARY_LENS (= 7), the length header contains the match length
-        * minus MIN_MATCH_LEN, and there is no length footer.
-        *
-        * Otherwise, the length header contains NUM_PRIMARY_LENS, and the
-        * length footer contains the match length minus NUM_PRIMARY_LENS minus
-        * MIN_MATCH_LEN. */
-       if (match_len_minus_2 < LZX_NUM_PRIMARY_LENS) {
-               len_header = match_len_minus_2;
-       } else {
-               len_header = LZX_NUM_PRIMARY_LENS;
-               len_footer = match_len_minus_2 - LZX_NUM_PRIMARY_LENS;
-       }
-
-       /* Combine the position slot with the length header into a single symbol
-        * that will be encoded with the main code.
-        *
-        * The actual main symbol is offset by LZX_NUM_CHARS because values
-        * under LZX_NUM_CHARS are used to indicate a literal byte rather than a
-        * match.  */
-       main_symbol = ((position_slot << 3) | len_header) + LZX_NUM_CHARS;
-
-       /* Output main symbol. */
-       lzx_write_varbits(os, codes->codewords.main[main_symbol],
-                         codes->lens.main[main_symbol],
-                         LZX_MAX_MAIN_CODEWORD_LEN);
-
-       /* If there is a length footer, output it using the
-        * length Huffman code. */
-       if (len_header == LZX_NUM_PRIMARY_LENS) {
-               lzx_write_varbits(os, codes->codewords.len[len_footer],
-                                 codes->lens.len[len_footer],
-                                 LZX_MAX_LEN_CODEWORD_LEN);
-       }
-
-       /* Output the position footer.  */
-
-       num_extra_bits = lzx_get_num_extra_bits(position_slot);
-
-       if ((num_extra_bits & ones_if_aligned) >= 3) {
-
-               /* Aligned offset blocks: The low 3 bits of the position footer
-                * are Huffman-encoded using the aligned offset code.  The
-                * remaining bits are output literally.  */
-
-               lzx_write_varbits(os,
-                                 position_footer >> 3, num_extra_bits - 3, 14);
-
-               lzx_write_varbits(os,
-                                 codes->codewords.aligned[position_footer & 7],
-                                 codes->lens.aligned[position_footer & 7],
-                                 LZX_MAX_ALIGNED_CODEWORD_LEN);
-       } else {
-               /* Verbatim blocks, or fewer than 3 extra bits:  All position
-                * footer bits are output literally.  */
-               lzx_write_varbits(os, position_footer, num_extra_bits, 17);
-       }
-}
-
-/* Output an LZX literal (encoded with the main Huffman code).  */
-static void
-lzx_write_literal(struct lzx_output_bitstream *os, unsigned literal,
-                 const struct lzx_codes *codes)
-{
-       lzx_write_varbits(os, codes->codewords.main[literal],
-                         codes->lens.main[literal], LZX_MAX_MAIN_CODEWORD_LEN);
-}
-
 static unsigned
 lzx_compute_precode_items(const u8 lens[restrict],
                          const u8 prev_lens[restrict],
@@ -925,6 +602,59 @@ lzx_write_compressed_code(struct lzx_output_bitstream *os,
        }
 }
 
+/* Output a match or literal.  */
+static inline void
+lzx_write_item(struct lzx_output_bitstream *os, struct lzx_item item,
+              unsigned ones_if_aligned, const struct lzx_codes *codes)
+{
+       u64 data = item.data;
+       unsigned main_symbol;
+       unsigned len_symbol;
+       unsigned num_extra_bits;
+       u32 extra_bits;
+
+       main_symbol = data & 0x3FF;
+
+       lzx_write_varbits(os, codes->codewords.main[main_symbol],
+                         codes->lens.main[main_symbol],
+                         LZX_MAX_MAIN_CODEWORD_LEN);
+
+       if (main_symbol < LZX_NUM_CHARS)  /* Literal?  */
+               return;
+
+       len_symbol = (data >> 10) & 0xFF;
+
+       if (len_symbol != LZX_LENCODE_NUM_SYMBOLS) {
+               lzx_write_varbits(os, codes->codewords.len[len_symbol],
+                                 codes->lens.len[len_symbol],
+                                 LZX_MAX_LEN_CODEWORD_LEN);
+       }
+
+       num_extra_bits = (data >> 18) & 0x1F;
+       if (num_extra_bits == 0)  /* Small offset or repeat offset match?  */
+               return;
+
+       extra_bits = data >> 23;
+
+       /*if (block_type == LZX_BLOCKTYPE_ALIGNED && num_extra_bits >= 3) {*/
+       if ((num_extra_bits & ones_if_aligned) >= 3) {
+
+               /* Aligned offset blocks: The low 3 bits of the extra offset
+                * bits are Huffman-encoded using the aligned offset code.  The
+                * remaining bits are output literally.  */
+
+               lzx_write_varbits(os, extra_bits >> 3, num_extra_bits - 3, 14);
+
+               lzx_write_varbits(os, codes->codewords.aligned[extra_bits & 7],
+                                 codes->lens.aligned[extra_bits & 7],
+                                 LZX_MAX_ALIGNED_CODEWORD_LEN);
+       } else {
+               /* Verbatim blocks, or fewer than 3 extra bits:  All extra
+                * offset bits are output literally.  */
+               lzx_write_varbits(os, extra_bits, num_extra_bits, 17);
+       }
+}
+
 /*
  * Write all matches and literal bytes (which were precomputed) in an LZX
  * compressed block to the output bitstream in the final compressed
@@ -950,18 +680,11 @@ lzx_write_items(struct lzx_output_bitstream *os, int block_type,
 {
        unsigned ones_if_aligned = 0U - (block_type == LZX_BLOCKTYPE_ALIGNED);
 
-       for (u32 i = 0; i < num_items; i++) {
-               /* The high bit of the 32-bit intermediate representation
-                * indicates whether the item is an actual LZ-style match (1) or
-                * a literal byte (0).  */
-               if (items[i].data & 0x80000000)
-                       lzx_write_match(os, ones_if_aligned, items[i], codes);
-               else
-                       lzx_write_literal(os, items[i].data, codes);
-       }
+       for (u32 i = 0; i < num_items; i++)
+               lzx_write_item(os, items[i], ones_if_aligned, codes);
 }
 
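The ones_if_aligned value above is a branchless trick; a sketch of the idea, with a hypothetical helper name:

	/* 0U - 1 == 0xFFFFFFFF and 0U - 0 == 0, so ANDing with the mask
	 * zeroes num_extra_bits in verbatim blocks.  One comparison then
	 * covers "aligned block AND num_extra_bits >= 3".  */
	static inline int
	sketch_use_aligned_code(int block_type, unsigned num_extra_bits)
	{
		unsigned ones_if_aligned = 0U - (block_type == LZX_BLOCKTYPE_ALIGNED);
		return (num_extra_bits & ones_if_aligned) >= 3;
	}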
-/* Write an LZX aligned offset or verbatim block to the output.  */
+/* Write an LZX aligned offset or verbatim block to the output bitstream.  */
 static void
 lzx_write_compressed_block(int block_type,
                           u32 block_size,
@@ -970,7 +693,7 @@ lzx_write_compressed_block(int block_type,
                           struct lzx_item * chosen_items,
                           u32 num_chosen_items,
                           const struct lzx_codes * codes,
-                          const struct lzx_codes * prev_codes,
+                          const struct lzx_lens * prev_lens,
                           struct lzx_output_bitstream * os)
 {
        LZX_ASSERT(block_type == LZX_BLOCKTYPE_ALIGNED ||
@@ -1006,7 +729,7 @@ lzx_write_compressed_block(int block_type,
                lzx_write_bits(os, block_size & 0xFFFF, 16);
        }
 
-       /* Output the aligned offset code.  */
+       /* If it's an aligned offset block, output the aligned offset code.  */
        if (block_type == LZX_BLOCKTYPE_ALIGNED) {
                for (int i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
                        lzx_write_bits(os, codes->lens.aligned[i],
@@ -1016,238 +739,78 @@ lzx_write_compressed_block(int block_type,
 
        /* Output the main code (two parts).  */
        lzx_write_compressed_code(os, codes->lens.main,
-                                 prev_codes->lens.main,
+                                 prev_lens->main,
                                  LZX_NUM_CHARS);
        lzx_write_compressed_code(os, codes->lens.main + LZX_NUM_CHARS,
-                                 prev_codes->lens.main + LZX_NUM_CHARS,
+                                 prev_lens->main + LZX_NUM_CHARS,
                                  num_main_syms - LZX_NUM_CHARS);
 
        /* Output the length code.  */
        lzx_write_compressed_code(os, codes->lens.len,
-                                 prev_codes->lens.len,
+                                 prev_lens->len,
                                  LZX_LENCODE_NUM_SYMBOLS);
 
        /* Output the compressed matches and literals.  */
        lzx_write_items(os, block_type, chosen_items, num_chosen_items, codes);
 }
 
-/* Write out the LZX blocks that were computed.  */
-static void
-lzx_write_all_blocks(struct lzx_compressor *c, struct lzx_output_bitstream *os)
+/* Don't allow matches to span the end of an LZX block.  */
+static inline unsigned
+maybe_truncate_matches(struct lz_match matches[], unsigned num_matches,
+                      struct lzx_compressor *c)
 {
+       if (c->match_window_end < c->cur_window_size && num_matches != 0) {
+               u32 limit = c->match_window_end - c->match_window_pos;
 
-       const struct lzx_codes *prev_codes = &c->zero_codes;
-       for (unsigned i = 0; i < c->num_blocks; i++) {
-               const struct lzx_block_spec *spec = &c->block_specs[i];
+               if (limit >= LZX_MIN_MATCH_LEN) {
 
-               lzx_write_compressed_block(spec->block_type,
-                                          spec->block_size,
-                                          c->window_order,
-                                          c->num_main_syms,
-                                          spec->chosen_items,
-                                          spec->num_chosen_items,
-                                          &spec->codes,
-                                          prev_codes,
-                                          os);
+                       unsigned i = num_matches - 1;
+                       do {
+                               if (matches[i].len >= limit) {
+                                       matches[i].len = limit;
 
-               prev_codes = &spec->codes;
+                                       /* Truncation might produce multiple
+                                        * matches with length 'limit'.  Keep at
+                                        * most 1.  */
+                                       num_matches = i + 1;
+                               }
+                       } while (i--);
+               } else {
+                       num_matches = 0;
+               }
        }
+       return num_matches;
 }
 
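A hypothetical worked example of the truncation loop above:

	/* Suppose limit = 10 and the match lengths are {3, 8, 12, 15}.  The
	 * downward scan clamps 15 and then 12 to length 10, setting
	 * num_matches to 4 and then 3, so the result is {3, 8, 10}: at most
	 * one match of length 'limit' survives.  */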
-/* Constructs an LZX match from a literal byte and updates the main code symbol
- * frequencies.  */
-static inline u32
-lzx_tally_literal(u8 lit, struct lzx_freqs *freqs)
+static unsigned
+lzx_get_matches_fillcache_singleblock(struct lzx_compressor *c,
+                                     const struct lz_match **matches_ret)
 {
 {
-       freqs->main[lit]++;
-       return (u32)lit;
-}
+       struct lz_match *cache_ptr;
+       struct lz_match *matches;
+       unsigned num_matches;
 
-/* Constructs an LZX match from an offset and a length, and updates the LRU
- * queue and the frequency of symbols in the main, length, and aligned offset
- * alphabets.  The return value is a 32-bit number that provides the match in an
- * intermediate representation documented below.  */
-static inline u32
-lzx_tally_match(unsigned match_len, u32 match_offset,
-               struct lzx_freqs *freqs, struct lzx_lru_queue *queue)
-{
-       unsigned position_slot;
-       u32 position_footer;
-       u32 len_header;
-       unsigned main_symbol;
-       unsigned len_footer;
-       unsigned adjusted_match_len;
-
-       LZX_ASSERT(match_len >= LZX_MIN_MATCH_LEN && match_len <= LZX_MAX_MATCH_LEN);
-
-       /* The match offset shall be encoded as a position slot (itself encoded
-        * as part of the main symbol) and a position footer.  */
-       position_slot = lzx_get_position_slot(match_offset, queue);
-       position_footer = (match_offset + LZX_OFFSET_OFFSET) &
-                               (((u32)1 << lzx_get_num_extra_bits(position_slot)) - 1);
-
-       /* The match length shall be encoded as a length header (itself encoded
-        * as part of the main symbol) and an optional length footer.  */
-       adjusted_match_len = match_len - LZX_MIN_MATCH_LEN;
-       if (adjusted_match_len < LZX_NUM_PRIMARY_LENS) {
-               /* No length footer needed.  */
-               len_header = adjusted_match_len;
+       cache_ptr = c->cache_ptr;
+       matches = cache_ptr + 1;
+       if (likely(cache_ptr <= c->cache_limit)) {
+               num_matches = lz_mf_get_matches(c->mf, matches);
+               cache_ptr->len = num_matches;
+               c->cache_ptr = matches + num_matches;
        } else {
-               /* Length footer needed.  It will be encoded using the length
-                * code.  */
-               len_header = LZX_NUM_PRIMARY_LENS;
-               len_footer = adjusted_match_len - LZX_NUM_PRIMARY_LENS;
-               freqs->len[len_footer]++;
+               num_matches = 0;
        }
-
-       /* Account for the main symbol.  */
-       main_symbol = ((position_slot << 3) | len_header) + LZX_NUM_CHARS;
-
-       freqs->main[main_symbol]++;
-
-       /* In an aligned offset block, 3 bits of the position footer are output
-        * as an aligned offset symbol.  Account for this, although we may
-        * ultimately decide to output the block as verbatim.  */
-
-       /* The following check is equivalent to:
-        *
-        * if (lzx_extra_bits[position_slot] >= 3)
-        *
-        * Note that this correctly excludes position slots that correspond to
-        * recent offsets.  */
-       if (position_slot >= 8)
-               freqs->aligned[position_footer & 7]++;
-
-       /* Pack the position slot, position footer, and match length into an
-        * intermediate representation.  See `struct lzx_item' for details.
-        */
-       LZX_ASSERT(LZX_MAX_POSITION_SLOTS <= 64);
-       LZX_ASSERT(lzx_get_num_extra_bits(LZX_MAX_POSITION_SLOTS - 1) <= 17);
-       LZX_ASSERT(LZX_MAX_MATCH_LEN - LZX_MIN_MATCH_LEN + 1 <= 256);
-
-       LZX_ASSERT(position_slot      <= (1U << (31 - 25)) - 1);
-       LZX_ASSERT(position_footer    <= (1U << (25 -  8)) - 1);
-       LZX_ASSERT(adjusted_match_len <= (1U << (8  -  0)) - 1);
-       return 0x80000000 |
-               (position_slot << 25) |
-               (position_footer << 8) |
-               (adjusted_match_len);
-}
-
-/* Returns the cost, in bits, to output a literal byte using the specified cost
- * model.  */
-static u32
-lzx_literal_cost(u8 c, const struct lzx_costs * costs)
-{
-       return costs->main[c];
+       c->match_window_pos++;
+       *matches_ret = matches;
+       return num_matches;
 }
 
-/* Returns the cost, in bits, to output a repeat offset match of the specified
- * length and position slot (repeat index) using the specified cost model.  */
-static u32
-lzx_repmatch_cost(u32 len, unsigned position_slot, const struct lzx_costs *costs)
+static unsigned
+lzx_get_matches_fillcache_multiblock(struct lzx_compressor *c,
+                                    const struct lz_match **matches_ret)
 {
 {
-       unsigned len_header, main_symbol;
-       u32 cost = 0;
-
-       len_header = min(len - LZX_MIN_MATCH_LEN, LZX_NUM_PRIMARY_LENS);
-       main_symbol = ((position_slot << 3) | len_header) + LZX_NUM_CHARS;
-
-       /* Account for main symbol.  */
-       cost += costs->main[main_symbol];
-
-       /* Account for extra length information.  */
-       if (len_header == LZX_NUM_PRIMARY_LENS)
-               cost += costs->len[len - LZX_MIN_MATCH_LEN - LZX_NUM_PRIMARY_LENS];
-
-       return cost;
-}
-
-/* Set the cost model @c->costs from the Huffman codeword lengths specified in
- * @lens.
- *
- * The cost model and codeword lengths are almost the same thing, but the
- * Huffman codewords with length 0 correspond to symbols with zero frequency
- * that still need to be assigned actual costs.  The specific values assigned
- * are arbitrary, but they should be fairly high (near the maximum codeword
- * length) to take into account the fact that uses of these symbols are expected
- * to be rare.  */
-static void
-lzx_set_costs(struct lzx_compressor *c, const struct lzx_lens * lens,
-             unsigned nostat)
-{
-       unsigned i;
-
-       /* Main code  */
-       for (i = 0; i < c->num_main_syms; i++)
-               c->costs.main[i] = lens->main[i] ? lens->main[i] : nostat;
-
-       /* Length code  */
-       for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++)
-               c->costs.len[i] = lens->len[i] ? lens->len[i] : nostat;
-
-       /* Aligned offset code  */
-       for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++)
-               c->costs.aligned[i] = lens->aligned[i] ? lens->aligned[i] : nostat / 2;
-}
-
-/* Don't allow matches to span the end of an LZX block.  */
-static inline u32
-maybe_truncate_matches(struct lz_match matches[], u32 num_matches,
-                      struct lzx_compressor *c)
-{
-       if (c->match_window_end < c->cur_window_size && num_matches != 0) {
-               u32 limit = c->match_window_end - c->match_window_pos;
-
-               if (limit >= LZX_MIN_MATCH_LEN) {
-
-                       u32 i = num_matches - 1;
-                       do {
-                               if (matches[i].len >= limit) {
-                                       matches[i].len = limit;
-
-                                       /* Truncation might produce multiple
-                                        * matches with length 'limit'.  Keep at
-                                        * most 1.  */
-                                       num_matches = i + 1;
-                               }
-                       } while (i--);
-               } else {
-                       num_matches = 0;
-               }
-       }
-       return num_matches;
-}
-
-static unsigned
-lzx_get_matches_fillcache_singleblock(struct lzx_compressor *c,
-                                     const struct lz_match **matches_ret)
-{
-       struct lz_match *cache_ptr;
-       struct lz_match *matches;
-       unsigned num_matches;
-
-       cache_ptr = c->cache_ptr;
-       matches = cache_ptr + 1;
-       if (likely(cache_ptr <= c->cache_limit)) {
-               num_matches = lz_mf_get_matches(c->mf, matches);
-               cache_ptr->len = num_matches;
-               c->cache_ptr = matches + num_matches;
-       } else {
-               num_matches = 0;
-       }
-       c->match_window_pos++;
-       *matches_ret = matches;
-       return num_matches;
-}
-
-static unsigned
-lzx_get_matches_fillcache_multiblock(struct lzx_compressor *c,
-                                    const struct lz_match **matches_ret)
-{
-       struct lz_match *cache_ptr;
-       struct lz_match *matches;
-       unsigned num_matches;
+       struct lz_match *cache_ptr;
+       struct lz_match *matches;
+       unsigned num_matches;
 
        cache_ptr = c->cache_ptr;
        matches = cache_ptr + 1;
@@ -1334,6 +897,8 @@ lzx_get_matches_nocache_multiblock(struct lzx_compressor *c,
 /*
  * Find matches at the next position in the window.
  *
+ * This uses a wrapper function around the underlying match-finder.
+ *
  * Returns the number of matches found and sets *matches_ret to point to the
  * matches array.  The matches will be sorted by strictly increasing length and
  * offset.
@@ -1400,6 +965,8 @@ lzx_skip_bytes_nocache(struct lzx_compressor *c, unsigned n)
 /*
  * Skip the specified number of positions in the window (don't search for
  * matches at them).
+ *
+ * This uses a wrapper function around the underlying match-finder.
  */
 static inline void
 lzx_skip_bytes(struct lzx_compressor *c, unsigned n)
@@ -1407,600 +974,915 @@ lzx_skip_bytes(struct lzx_compressor *c, unsigned n)
        return (*c->skip_bytes_func)(c, n);
 }
 
-/*
- * Reverse the linked list of near-optimal matches so that they can be returned
- * in forwards order.
- *
- * Returns the first match in the list.
- */
-static struct lz_match
-lzx_match_chooser_reverse_list(struct lzx_compressor *c, unsigned cur_pos)
+/* Tally, and optionally record, the specified literal byte.  */
+static inline void
+lzx_declare_literal(struct lzx_compressor *c, unsigned literal,
+                   struct lzx_item **next_chosen_item)
 {
 {
-       unsigned prev_link, saved_prev_link;
-       unsigned prev_match_offset, saved_prev_match_offset;
+       unsigned main_symbol = literal;
 
 
+       c->freqs.main[main_symbol]++;
 
 
-       saved_prev_match_offset = c->optimum[cur_pos].prev.match_offset;
+       if (next_chosen_item) {
+               *(*next_chosen_item)++ = (struct lzx_item) {
+                       .data = main_symbol,
+               };
+       }
+}
 
-       do {
-               prev_link = saved_prev_link;
-               prev_match_offset = saved_prev_match_offset;
+/* Tally, and optionally record, the specified repeat offset match.  */
+static inline void
+lzx_declare_repeat_offset_match(struct lzx_compressor *c,
+                               unsigned len, unsigned rep_index,
+                               struct lzx_item **next_chosen_item)
+{
+       unsigned len_header;
+       unsigned main_symbol;
+       unsigned len_symbol;
 
-               saved_prev_link = c->optimum[prev_link].prev.link;
-               saved_prev_match_offset = c->optimum[prev_link].prev.match_offset;
+       if (len - LZX_MIN_MATCH_LEN < LZX_NUM_PRIMARY_LENS) {
+               len_header = len - LZX_MIN_MATCH_LEN;
+               len_symbol = LZX_LENCODE_NUM_SYMBOLS;
+       } else {
+               len_header = LZX_NUM_PRIMARY_LENS;
+               len_symbol = len - LZX_MIN_MATCH_LEN - LZX_NUM_PRIMARY_LENS;
+               c->freqs.len[len_symbol]++;
+       }
+
+       main_symbol = LZX_NUM_CHARS + ((rep_index << 3) | len_header);
+
+       c->freqs.main[main_symbol]++;
+
+       if (next_chosen_item) {
+               *(*next_chosen_item)++ = (struct lzx_item) {
+                       .data = (u64)main_symbol | ((u64)len_symbol << 10),
+               };
+       }
+}
+
+/* Tally, and optionally record, the specified explicit offset match.  */
+static inline void
+lzx_declare_explicit_offset_match(struct lzx_compressor *c, unsigned len, u32 offset,
+                                 struct lzx_item **next_chosen_item)
+{
+       unsigned len_header;
+       unsigned main_symbol;
+       unsigned len_symbol;
+       unsigned offset_slot;
+       unsigned num_extra_bits;
+       u32 extra_bits;
+
+       if (len - LZX_MIN_MATCH_LEN < LZX_NUM_PRIMARY_LENS) {
+               len_header = len - LZX_MIN_MATCH_LEN;
+               len_symbol = LZX_LENCODE_NUM_SYMBOLS;
+       } else {
+               len_header = LZX_NUM_PRIMARY_LENS;
+               len_symbol = len - LZX_MIN_MATCH_LEN - LZX_NUM_PRIMARY_LENS;
+               c->freqs.len[len_symbol]++;
+       }
+
+       offset_slot = lzx_get_offset_slot_raw(offset + LZX_OFFSET_OFFSET);
 
-               c->optimum[prev_link].next.link = cur_pos;
-               c->optimum[prev_link].next.match_offset = prev_match_offset;
+       main_symbol = LZX_NUM_CHARS + ((offset_slot << 3) | len_header);
 
-               cur_pos = prev_link;
-       } while (cur_pos != 0);
+       c->freqs.main[main_symbol]++;
 
-       c->optimum_cur_idx = c->optimum[0].next.link;
+       if (offset_slot >= 8)
+               c->freqs.aligned[(offset + LZX_OFFSET_OFFSET) & 7]++;
 
-       return (struct lz_match)
-               { .len = c->optimum_cur_idx,
-                 .offset = c->optimum[0].next.match_offset,
+       if (next_chosen_item) {
+
+               num_extra_bits = lzx_extra_offset_bits[offset_slot];
+
+               extra_bits = (offset + LZX_OFFSET_OFFSET) -
+                            lzx_offset_slot_base[offset_slot];
+
+               *(*next_chosen_item)++ = (struct lzx_item) {
+                       .data = (u64)main_symbol |
+                               ((u64)len_symbol << 10) |
+                               ((u64)num_extra_bits << 18) |
+                               ((u64)extra_bits << 23),
                };
+       }
 }
 
-/*
- * Find the longest repeat offset match.
- *
- * If no match of at least LZX_MIN_MATCH_LEN bytes is found, then return 0.
+/* Tally, and optionally record, the specified match or literal.  */
+static inline void
+lzx_declare_item(struct lzx_compressor *c, u32 mc_item_data,
+                struct lzx_item **next_chosen_item)
+{
+       u32 len = mc_item_data & MC_LEN_MASK;
+       u32 offset_data = mc_item_data >> MC_OFFSET_SHIFT;
+
+       if (len == 1)
+               lzx_declare_literal(c, offset_data, next_chosen_item);
+       else if (offset_data < LZX_NUM_RECENT_OFFSETS)
+               lzx_declare_repeat_offset_match(c, len, offset_data,
+                                               next_chosen_item);
+       else
+               lzx_declare_explicit_offset_match(c, len,
+                                                 offset_data - LZX_OFFSET_OFFSET,
+                                                 next_chosen_item);
+}
+
+static inline void
+lzx_record_item_list(struct lzx_compressor *c,
+                    struct lzx_mc_pos_data *cur_optimum_ptr,
+                    struct lzx_item **next_chosen_item)
+{
+       struct lzx_mc_pos_data *end_optimum_ptr;
+       u32 saved_item;
+       u32 item;
+
+       /* The list is currently in reverse order (last item to first item).
+        * Reverse it.  */
+       end_optimum_ptr = cur_optimum_ptr;
+       saved_item = cur_optimum_ptr->mc_item_data;
+       do {
+               item = saved_item;
+               cur_optimum_ptr -= item & MC_LEN_MASK;
+               saved_item = cur_optimum_ptr->mc_item_data;
+               cur_optimum_ptr->mc_item_data = item;
+       } while (cur_optimum_ptr != c->optimum);
+
+       /* Walk the list of items from beginning to end, tallying and recording
+        * each item.  */
+       do {
+               lzx_declare_item(c, cur_optimum_ptr->mc_item_data, next_chosen_item);
+               cur_optimum_ptr += (cur_optimum_ptr->mc_item_data) & MC_LEN_MASK;
+       } while (cur_optimum_ptr != end_optimum_ptr);
+}
+
+static inline void
+lzx_tally_item_list(struct lzx_compressor *c, struct lzx_mc_pos_data *cur_optimum_ptr)
+{
+       /* Since we're just tallying the items, we don't need to reverse the
+        * list.  Processing the items in reverse order is fine.  */
+       do {
+               lzx_declare_item(c, cur_optimum_ptr->mc_item_data, NULL);
+               cur_optimum_ptr -= (cur_optimum_ptr->mc_item_data & MC_LEN_MASK);
+       } while (cur_optimum_ptr != c->optimum);
+}
+
+/* Tally, and optionally (if next_chosen_item != NULL) record, in order, all
+ * items in the current list of items found by the match-chooser.  */
+static void
+lzx_declare_item_list(struct lzx_compressor *c, struct lzx_mc_pos_data *cur_optimum_ptr,
+                     struct lzx_item **next_chosen_item)
+{
+       if (next_chosen_item)
+               lzx_record_item_list(c, cur_optimum_ptr, next_chosen_item);
+       else
+               lzx_tally_item_list(c, cur_optimum_ptr);
+}
+
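Note how the item list is threaded through the 'optimum' array itself: each entry's length field gives the distance back to the previous item's entry.  A hypothetical walk:

	/* Items A (len 3), B (len 1), C (len 2) covering positions 0..6 are
	 * stored at optimum[3], optimum[4], and optimum[6].  Starting from
	 * position 6: 6 - len(C) = 4, 4 - len(B) = 3, 3 - len(A) = 0, so the
	 * items are visited C, B, A with no explicit links needed.  */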
+/* Set the cost model @c->costs from the Huffman codeword lengths specified in
+ * @lens.
  *
- * If a match of at least LZX_MIN_MATCH_LEN bytes is found, then return its
- * length and set *slot_ret to the index of its offset in @queue.
- */
+ * The cost model and codeword lengths are almost the same thing, but the
+ * Huffman codewords with length 0 correspond to symbols with zero frequency
+ * that still need to be assigned actual costs.  The specific values assigned
+ * are arbitrary, but they should be fairly high (near the maximum codeword
+ * length) to take into account the fact that uses of these symbols are expected
+ * to be rare.  */
+static void
+lzx_set_costs(struct lzx_compressor *c, const struct lzx_lens * lens)
+{
+       unsigned i;
+
+       /* Main code  */
+       for (i = 0; i < c->num_main_syms; i++)
+               c->costs.main[i] = lens->main[i] ? lens->main[i] : 15;
+
+       /* Length code  */
+       for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++)
+               c->costs.len[i] = lens->len[i] ? lens->len[i] : 15;
+
+       /* Aligned offset code  */
+       for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++)
+               c->costs.aligned[i] = lens->aligned[i] ? lens->aligned[i] : 7;
+}
+
+/* Set default LZX Huffman symbol costs to bootstrap the iterative optimization
+ * algorithm.  */
+static void
+lzx_set_default_costs(struct lzx_costs * costs, unsigned num_main_syms)
+{
+       unsigned i;
+
+       /* Main code (part 1): Literal symbols  */
+       for (i = 0; i < LZX_NUM_CHARS; i++)
+               costs->main[i] = 8;
+
+       /* Main code (part 2): Match header symbols  */
+       for (; i < num_main_syms; i++)
+               costs->main[i] = 10;
+
+       /* Length code  */
+       for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++)
+               costs->len[i] = 8;
+
+       /* Aligned offset code  */
+       for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++)
+               costs->aligned[i] = 3;
+}
+
+/* Return the cost, in bits, to output a literal byte using the specified cost
+ * model.  */
 static inline u32
-lzx_repsearch(const u8 * const strptr, const u32 bytes_remaining,
-             const struct lzx_lru_queue *queue, unsigned *slot_ret)
+lzx_literal_cost(unsigned literal, const struct lzx_costs * costs)
 {
-       BUILD_BUG_ON(LZX_MIN_MATCH_LEN != 2);
-       return lz_repsearch(strptr, bytes_remaining, LZX_MAX_MATCH_LEN,
-                           queue->R, LZX_NUM_RECENT_OFFSETS, slot_ret);
+       return costs->main[literal];
 }
 
-/*
- * lzx_choose_near_optimal_item() -
- *
- * Choose an approximately optimal match or literal to use at the next position
- * in the string, or "window", being LZ-encoded.
- *
- * This is based on algorithms used in 7-Zip, including the DEFLATE encoder
- * and the LZMA encoder, written by Igor Pavlov.
- *
- * Unlike a greedy parser that always takes the longest match, or even a "lazy"
- * parser with one match/literal look-ahead like zlib, the algorithm used here
- * may look ahead many matches/literals to determine the approximately optimal
- * match/literal to code next.  The motivation is that the compression ratio is
- * improved if the compressor can do things like use a shorter-than-possible
- * match in order to allow a longer match later, and also take into account the
- * estimated real cost of coding each match/literal based on the underlying
- * entropy encoding.
- *
- * Still, this is not a true optimal parser for several reasons:
- *
- * - Real compression formats use entropy encoding of the literal/match
- *   sequence, so the real cost of coding each match or literal is unknown until
- *   the parse is fully determined.  It can be approximated based on iterative
- *   parses, but the end result is not guaranteed to be globally optimal.
- *
- * - Very long matches are chosen immediately.  This is because locations with
- *   long matches are likely to have many possible alternatives that would cause
- *   slow optimal parsing, but also such locations are already highly
- *   compressible so it is not too harmful to just grab the longest match.
- *
- * - Not all possible matches at each location are considered because the
- *   underlying match-finder limits the number and type of matches produced at
- *   each position.  For example, for a given match length it's usually not
- *   worth it to only consider matches other than the lowest-offset match,
- *   except in the case of a repeat offset.
- *
- * - Although we take into account the adaptive state (in LZX, the recent offset
- *   queue), coding decisions made with respect to the adaptive state will be
- *   locally optimal but will not necessarily be globally optimal.  This is
- *   because the algorithm only keeps the least-costly path to get to a given
- *   location and does not take into account that a slightly more costly path
- *   could result in a different adaptive state that ultimately results in a
- *   lower global cost.
- *
- * - The array space used by this function is bounded, so in degenerate cases it
- *   is forced to start returning matches/literals before the algorithm has
- *   really finished.
- *
- * Each call to this function does one of two things:
- *
- * 1. Build a sequence of near-optimal matches/literals, up to some point, that
- *    will be returned by subsequent calls to this function, then return the
- *    first one.
- *
- * OR
- *
- * 2. Return the next match/literal previously computed by a call to this
- *    function.
- *
- * The return value is a (length, offset) pair specifying the match or literal
- * chosen.  For literals, the length is 0 or 1 and the offset is meaningless.
- */
-static struct lz_match
-lzx_choose_near_optimal_item(struct lzx_compressor *c)
+/* Return the cost, in bits, to output a match of the specified length and
+ * offset slot using the specified cost model.  Does not take into account
+ * extra offset bits.  */
+static inline u32
+lzx_match_cost_raw(unsigned len, unsigned offset_slot,
+                  const struct lzx_costs *costs)
 {
-       unsigned num_matches;
-       const struct lz_match *matches;
-       struct lz_match match;
-       u32 longest_len;
-       u32 longest_rep_len;
-       unsigned longest_rep_slot;
-       unsigned cur_pos;
-       unsigned end_pos;
-       struct lzx_mc_pos_data *optimum = c->optimum;
-
-       if (c->optimum_cur_idx != c->optimum_end_idx) {
-               /* Case 2: Return the next match/literal already found.  */
-               match.len = optimum[c->optimum_cur_idx].next.link -
-                                   c->optimum_cur_idx;
-               match.offset = optimum[c->optimum_cur_idx].next.match_offset;
-
-               c->optimum_cur_idx = optimum[c->optimum_cur_idx].next.link;
-               return match;
+       u32 cost;
+       unsigned len_header;
+       unsigned main_symbol;
+
+       if (len - LZX_MIN_MATCH_LEN < LZX_NUM_PRIMARY_LENS) {
+               len_header = len - LZX_MIN_MATCH_LEN;
+               cost = 0;
+       } else {
+               len_header = LZX_NUM_PRIMARY_LENS;
+
+               /* Account for length symbol.  */
+               cost = costs->len[len - LZX_MIN_MATCH_LEN - LZX_NUM_PRIMARY_LENS];
        }
 
-       /* Case 1:  Compute a new list of matches/literals to return.  */
+       /* Account for main symbol.  */
+       main_symbol = LZX_NUM_CHARS + ((offset_slot << 3) | len_header);
+       cost += costs->main[main_symbol];
+
+       return cost;
+}
+
+/* Equivalent to lzx_match_cost_raw(), but assumes the length is small enough
+ * that it doesn't require a length symbol.  */
+static inline u32
+lzx_match_cost_raw_smalllen(unsigned len, unsigned offset_slot,
+                           const struct lzx_costs *costs)
+{
+       LZX_ASSERT(len < LZX_MIN_MATCH_LEN + LZX_NUM_PRIMARY_LENS);
+       return costs->main[LZX_NUM_CHARS +
+                          ((offset_slot << 3) | (len - LZX_MIN_MATCH_LEN))];
+}
+
+/*
+ * Consider coding the match at repeat offset index @rep_idx.  Consider each
+ * length from the minimum (2) to the full match length (@rep_len).
+ */
+static inline void
+lzx_consider_repeat_offset_match(struct lzx_compressor *c,
+                                struct lzx_mc_pos_data *cur_optimum_ptr,
+                                unsigned rep_len, unsigned rep_idx)
+{
+       u32 base_cost = cur_optimum_ptr->cost;
+       u32 cost;
+       unsigned len;
 
 
-       c->optimum_cur_idx = 0;
-       c->optimum_end_idx = 0;
+#if 1   /* Optimized version */
 
 
-       /* Search for matches at repeat offsets.  As a heuristic, we only keep
-        * the one with the longest match length.  */
-       if (likely(c->match_window_pos >= 1)) {
-               longest_rep_len = lzx_repsearch(&c->cur_window[c->match_window_pos],
-                                               c->match_window_end - c->match_window_pos,
-                                               &c->queue,
-                                               &longest_rep_slot);
+       if (rep_len < LZX_MIN_MATCH_LEN + LZX_NUM_PRIMARY_LENS) {
+               /* All lengths being considered are small.  */
+               len = 2;
+               do {
+                       cost = base_cost +
+                              lzx_match_cost_raw_smalllen(len, rep_idx, &c->costs);
+                       if (cost < (cur_optimum_ptr + len)->cost) {
+                               (cur_optimum_ptr + len)->mc_item_data =
+                                       (rep_idx << MC_OFFSET_SHIFT) | len;
+                               (cur_optimum_ptr + len)->cost = cost;
+                       }
+               } while (++len <= rep_len);
        } else {
-               longest_rep_len = 0;
-       }
+               /* Some lengths being considered are small, and some are big.
+                * Start with the optimized loop for small lengths, then switch
+                * to the optimized loop for big lengths.  */
+               len = 2;
+               do {
+                       cost = base_cost +
+                              lzx_match_cost_raw_smalllen(len, rep_idx, &c->costs);
+                       if (cost < (cur_optimum_ptr + len)->cost) {
+                               (cur_optimum_ptr + len)->mc_item_data =
+                                       (rep_idx << MC_OFFSET_SHIFT) | len;
+                               (cur_optimum_ptr + len)->cost = cost;
+                       }
+               } while (++len < LZX_MIN_MATCH_LEN + LZX_NUM_PRIMARY_LENS);
 
 
-       /* If there's a long match with a repeat offset, choose it immediately.  */
-       if (longest_rep_len >= c->params.nice_match_length) {
-               lzx_skip_bytes(c, longest_rep_len);
-               return (struct lz_match) {
-                       .len = longest_rep_len,
-                       .offset = c->queue.R[longest_rep_slot],
-               };
+               /* The main symbol is now fixed.  */
+               base_cost += c->costs.main[LZX_NUM_CHARS +
+                                          ((rep_idx << 3) | LZX_NUM_PRIMARY_LENS)];
+               do {
+                       cost = base_cost +
+                              c->costs.len[len - LZX_MIN_MATCH_LEN -
+                                           LZX_NUM_PRIMARY_LENS];
+                       if (cost < (cur_optimum_ptr + len)->cost) {
+                               (cur_optimum_ptr + len)->mc_item_data =
+                                       (rep_idx << MC_OFFSET_SHIFT) | len;
+                               (cur_optimum_ptr + len)->cost = cost;
+                       }
+               } while (++len <= rep_len);
        }
 
-       /* Find other matches.  */
-       num_matches = lzx_get_matches(c, &matches);
+#else   /* Unoptimized version  */
 
 
-       /* If there's a long match, choose it immediately.  */
-       if (num_matches) {
-               longest_len = matches[num_matches - 1].len;
-               if (longest_len >= c->params.nice_match_length) {
-                       lzx_skip_bytes(c, longest_len - 1);
-                       return matches[num_matches - 1];
+       len = 2;
+       do {
+               cost = base_cost +
+                      lzx_match_cost_raw(len, rep_idx, &c->costs);
+               if (cost < (cur_optimum_ptr + len)->cost) {
+                       (cur_optimum_ptr + len)->mc_item_data =
+                               (rep_idx << MC_OFFSET_SHIFT) | len;
+                       (cur_optimum_ptr + len)->cost = cost;
                }
+       } while (++len <= rep_len);
+#endif
+}
+
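The update performed in each loop iteration above is the "relax an edge"
step of the minimum-cost path search: the cheapest known path to position
cur + len is replaced if going through the current position is cheaper.
In isolation, and assuming MC_OFFSET_SHIFT is 9 (wide enough for LZX's
maximum match length of 257; the actual value is defined elsewhere in
this file), the step looks like:

    #include <stdint.h>

    #define MC_OFFSET_SHIFT 9  /* assumed; the length lives in the low bits */

    struct pos_data { uint32_t cost, mc_item_data; };

    /* Relax the edge cur -> cur + len if the new path is cheaper.  */
    static void relax(struct pos_data *cur, unsigned len,
                      uint32_t edge_cost, uint32_t offset_data)
    {
            if (cur->cost + edge_cost < (cur + len)->cost) {
                    (cur + len)->cost = cur->cost + edge_cost;
                    (cur + len)->mc_item_data =
                            (offset_data << MC_OFFSET_SHIFT) | len;
            }
    }

    int main(void)
    {
            struct pos_data opt[8] = { { 0, 0 } };

            for (int i = 1; i < 8; i++)
                    opt[i].cost = UINT32_MAX;
            relax(&opt[0], 5, 30, 2);  /* e.g. repeat offset index 2, len 5 */
            return opt[5].mc_item_data != ((2 << MC_OFFSET_SHIFT) | 5);
    }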
+/*
+ * Consider coding each match in @matches as an explicit offset match.
+ *
+ * @matches must be sorted by strictly increasing length and strictly
+ * increasing offset.  This is guaranteed by the match-finder.
+ *
+ * We consider each length from the minimum (2) to the longest
+ * (matches[num_matches - 1].len).  For each length, we consider only
+ * the smallest offset for which that length is available.  Although
+ * this is not guaranteed to be optimal due to the possibility of a
+ * larger offset costing less than a smaller offset to code, this is a
+ * very useful heuristic.
+ */
+static inline void
+lzx_consider_explicit_offset_matches(struct lzx_compressor *c,
+                                    struct lzx_mc_pos_data *cur_optimum_ptr,
+                                    const struct lz_match matches[],
+                                    unsigned num_matches)
+{
+       LZX_ASSERT(num_matches > 0);
+
+       unsigned i;
+       unsigned len;
+       unsigned offset_slot;
+       u32 position_cost;
+       u32 cost;
+       u32 offset_data;
+
+#if 1  /* Optimized version */
+
+       if (matches[num_matches - 1].offset < LZX_NUM_FAST_OFFSETS) {
+
+               /*
+                * Offset is small; the offset slot can be looked up directly in
+                * c->offset_slot_fast.
+                *
+                * Additional optimizations:
+                *
+                * - Since the offset is small, it falls in the exponential part
+                *   of the offset slot bases and the number of extra offset
+                *   bits can be calculated directly as (offset_slot >> 1) - 1.
+                *
+                * - Just consider the number of extra offset bits; don't
+                *   account for the aligned offset code.  Usually this has
+                *   almost no effect on the compression ratio.
+                *
+                * - Start out in a loop optimized for small lengths.  When the
+                *   length becomes high enough that a length symbol will be
+                *   needed, jump into a loop optimized for big lengths.
+                */
+
+               /* Any offset this small falls in an offset slot <= 37, for
+                * which the extra-bits formula below is valid.  */
+
+               len = 2;
+               i = 0;
+               do {
+                       offset_slot = c->offset_slot_fast[matches[i].offset];
+                       position_cost = cur_optimum_ptr->cost +
+                                       ((offset_slot >> 1) - 1);
+                       offset_data = matches[i].offset + LZX_OFFSET_OFFSET;
+                       do {
+                               if (len >= LZX_MIN_MATCH_LEN + LZX_NUM_PRIMARY_LENS)
+                                       goto biglen;
+                               cost = position_cost +
+                                      lzx_match_cost_raw_smalllen(len, offset_slot,
+                                                                  &c->costs);
+                               if (cost < (cur_optimum_ptr + len)->cost) {
+                                       (cur_optimum_ptr + len)->cost = cost;
+                                       (cur_optimum_ptr + len)->mc_item_data =
+                                               (offset_data << MC_OFFSET_SHIFT) | len;
+                               }
+                       } while (++len <= matches[i].len);
+               } while (++i != num_matches);
+
+               return;
+
+               do {
+                       offset_slot = c->offset_slot_fast[matches[i].offset];
+       biglen:
+                       position_cost = cur_optimum_ptr->cost +
+                                       ((offset_slot >> 1) - 1) +
+                                       c->costs.main[LZX_NUM_CHARS +
+                                                     ((offset_slot << 3) |
+                                                      LZX_NUM_PRIMARY_LENS)];
+                       offset_data = matches[i].offset + LZX_OFFSET_OFFSET;
+                       do {
+                               cost = position_cost +
+                                      c->costs.len[len - LZX_MIN_MATCH_LEN -
+                                                   LZX_NUM_PRIMARY_LENS];
+                               if (cost < (cur_optimum_ptr + len)->cost) {
+                                       (cur_optimum_ptr + len)->cost = cost;
+                                       (cur_optimum_ptr + len)->mc_item_data =
+                                               (offset_data << MC_OFFSET_SHIFT) | len;
+                               }
+                       } while (++len <= matches[i].len);
+               } while (++i != num_matches);
        } else {
-               longest_len = 1;
+               len = 2;
+               i = 0;
+               do {
+                       offset_data = matches[i].offset + LZX_OFFSET_OFFSET;
+                       offset_slot = lzx_get_offset_slot_raw(offset_data);
+                       position_cost = cur_optimum_ptr->cost +
+                                       lzx_extra_offset_bits[offset_slot];
+                       do {
+                               cost = position_cost +
+                                      lzx_match_cost_raw(len, offset_slot, &c->costs);
+                               if (cost < (cur_optimum_ptr + len)->cost) {
+                                       (cur_optimum_ptr + len)->cost = cost;
+                                       (cur_optimum_ptr + len)->mc_item_data =
+                                               (offset_data << MC_OFFSET_SHIFT) | len;
+                               }
+                       } while (++len <= matches[i].len);
+               } while (++i != num_matches);
        }
 
-       /* Calculate the cost to reach the next position by coding a literal.  */
-       optimum[1].queue = c->queue;
-       optimum[1].cost = lzx_literal_cost(c->cur_window[c->match_window_pos - 1],
-                                             &c->costs);
-       optimum[1].prev.link = 0;
+#else  /* Unoptimized version */
 
 
-       /* Calculate the cost to reach any position up to and including that
-        * reached by the longest match.
-        *
-        * Note: We consider only the lowest-offset match that reaches each
-        * position.
-        *
-        * Note: Some of the cost calculation stays the same for each offset,
-        * regardless of how many lengths it gets used for.  Therefore, to
-        * improve performance, we hand-code the cost calculation instead of
-        * calling lzx_match_cost() to do a from-scratch cost evaluation at each
-        * length.  */
-       for (unsigned i = 0, len = 2; i < num_matches; i++) {
-               u32 offset;
-               struct lzx_lru_queue queue;
-               u32 position_cost;
-               unsigned position_slot;
-               unsigned num_extra_bits;
-
-               offset = matches[i].offset;
-               queue = c->queue;
-               position_cost = 0;
-
-               position_slot = lzx_get_position_slot(offset, &queue);
-               num_extra_bits = lzx_get_num_extra_bits(position_slot);
+       unsigned num_extra_bits;
+
+       len = 2;
+       i = 0;
+       do {
+               offset_data = matches[i].offset + LZX_OFFSET_OFFSET;
+               position_cost = cur_optimum_ptr->cost;
+               offset_slot = lzx_get_offset_slot_raw(offset_data);
+               num_extra_bits = lzx_extra_offset_bits[offset_slot];
                if (num_extra_bits >= 3) {
                        position_cost += num_extra_bits - 3;
-                       position_cost += c->costs.aligned[(offset + LZX_OFFSET_OFFSET) & 7];
+                       position_cost += c->costs.aligned[offset_data & 7];
                } else {
                        position_cost += num_extra_bits;
                }
-
                do {
-                       u32 cost;
-                       unsigned len_header;
-                       unsigned main_symbol;
-
-                       cost = position_cost;
-
-                       if (len - LZX_MIN_MATCH_LEN < LZX_NUM_PRIMARY_LENS) {
-                               len_header = len - LZX_MIN_MATCH_LEN;
-                       } else {
-                               len_header = LZX_NUM_PRIMARY_LENS;
-                               cost += c->costs.len[len - LZX_MIN_MATCH_LEN - LZX_NUM_PRIMARY_LENS];
+                       cost = position_cost +
+                              lzx_match_cost_raw(len, offset_slot, &c->costs);
+                       if (cost < (cur_optimum_ptr + len)->cost) {
+                               (cur_optimum_ptr + len)->cost = cost;
+                               (cur_optimum_ptr + len)->mc_item_data =
+                                       (offset_data << MC_OFFSET_SHIFT) | len;
                        }
-
-                       main_symbol = ((position_slot << 3) | len_header) + LZX_NUM_CHARS;
-                       cost += c->costs.main[main_symbol];
-
-                       optimum[len].queue = queue;
-                       optimum[len].prev.link = 0;
-                       optimum[len].prev.match_offset = offset;
-                       optimum[len].cost = cost;
                } while (++len <= matches[i].len);
-       }
-       end_pos = longest_len;
+       } while (++i != num_matches);
+#endif
+}
 
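A worked example of the per-length heuristic: if the match-finder
returned the (hypothetical) list {len 3, offset 16}, {len 6, offset 900},
then lengths 2 and 3 are costed with offset 16 and lengths 4 through 6
with offset 900, because the sort order guarantees 16 is the smallest
offset at which lengths up to 3 are available.  The iteration pattern in
isolation:

    #include <stdio.h>

    struct lz_match { unsigned len, offset; };

    int main(void)
    {
            /* Hypothetical list, sorted by increasing length and offset. */
            const struct lz_match matches[] = { { 3, 16 }, { 6, 900 } };
            unsigned len = 2;

            for (unsigned i = 0; i < 2; i++)
                    for (; len <= matches[i].len; len++)
                            printf("cost len %u with offset %u\n",
                                   len, matches[i].offset);
            return 0;
    }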
 
-       if (longest_rep_len) {
+/*
+ * Search for repeat offset matches with the current position.
+ */
+static inline unsigned
+lzx_repsearch(const u8 * const strptr, const u32 bytes_remaining,
+             const struct lzx_lru_queue *queue, unsigned *rep_max_idx_ret)
+{
+       BUILD_BUG_ON(LZX_NUM_RECENT_OFFSETS != 3);
+       return lz_repsearch3(strptr, min(bytes_remaining, LZX_MAX_MATCH_LEN),
+                            queue->R, rep_max_idx_ret);
+}
 
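lz_repsearch3() (declared in include/wimlib/lz_repsearch.h) reports the
length of the longest match between the string and the data at each of
the three recent offsets, plus which queue index produced it.  The real
routine is optimized; a plain statement of the same contract:

    #include <stdint.h>
    #include <stdio.h>

    static unsigned
    repsearch3_sketch(const uint8_t *str, unsigned max_len,
                      const uint32_t offsets[3], unsigned *max_idx_ret)
    {
            unsigned best_len = 0, best_idx = 0;

            for (unsigned i = 0; i < 3; i++) {
                    const uint8_t *cand = str - offsets[i];
                    unsigned len = 0;

                    while (len < max_len && cand[len] == str[len])
                            len++;
                    if (len > best_len) {
                            best_len = len;
                            best_idx = i;
                    }
            }
            *max_idx_ret = best_idx;
            return best_len;
    }

    int main(void)
    {
            static const uint8_t buf[] = "abcabcabcXYZ";
            static const uint32_t R[3] = { 3, 6, 2 };
            unsigned idx;
            unsigned len = repsearch3_sketch(&buf[6], 3, R, &idx);

            printf("len %u at queue index %u\n", len, idx);  /* 3, 0 */
            return 0;
    }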
 
-               LZX_ASSERT(longest_rep_len >= LZX_MIN_MATCH_LEN);
+/*
+ * The main near-optimal parsing routine.
+ *
+ * Briefly, the algorithm does an approximate minimum-cost path search to find a
+ * "near-optimal" sequence of matches and literals to output, based on the
+ * current cost model.  The algorithm steps forward, position by position (byte
+ * by byte), and updates the minimum cost path to reach each later position that
+ * can be reached using a match or literal from the current position.  This is
+ * essentially Dijkstra's algorithm in disguise: the graph nodes are positions,
+ * the graph edges are possible matches/literals to code, and the cost of each
+ * edge is the estimated number of bits that will be required to output the
+ * corresponding match or literal.  But one difference is that we actually
+ * compute the lowest-cost path in pieces, where each piece is terminated when
+ * there are no choices to be made.
+ *
+ * This function will run this algorithm on the portion of the window from
+ * &c->cur_window[c->match_window_pos] to &c->cur_window[c->match_window_end].
+ *
+ * On entry, c->queue must be the current state of the match offset LRU queue,
+ * and c->costs must be the current cost model to use for Huffman symbols.
+ *
+ * On exit, c->queue will be the state that the LRU queue would be in if the
+ * chosen items were to be coded.
+ *
+ * If next_chosen_item != NULL, then all items chosen will be recorded (saved in
+ * the chosen_items array).  Otherwise, all items chosen will only be tallied
+ * (symbol frequencies tallied in c->freqs).
+ */
+static void
+lzx_optim_pass(struct lzx_compressor *c, struct lzx_item **next_chosen_item)
+{
+       const u8 *block_end;
+       struct lzx_lru_queue *begin_queue;
+       const u8 *window_ptr;
+       struct lzx_mc_pos_data *cur_optimum_ptr;
+       struct lzx_mc_pos_data *end_optimum_ptr;
+       const struct lz_match *matches;
+       unsigned num_matches;
+       unsigned longest_len;
+       unsigned rep_max_len;
+       unsigned rep_max_idx;
+       unsigned literal;
+       unsigned len;
+       u32 cost;
+       u32 offset_data;
 
 
-               u32 cost;
+       block_end = &c->cur_window[c->match_window_end];
+       begin_queue = &c->queue;
+begin:
+       /* Start building a new list of items, which will correspond to the next
+        * piece of the overall minimum-cost path.
+        *
+        * *begin_queue is the current state of the match offset LRU queue.  */
 
 
-               while (end_pos < longest_rep_len)
-                       optimum[++end_pos].cost = MC_INFINITE_COST;
+       window_ptr = &c->cur_window[c->match_window_pos];
 
 
-               cost = lzx_repmatch_cost(longest_rep_len, longest_rep_slot,
-                                        &c->costs);
-               if (cost <= optimum[longest_rep_len].cost) {
-                       optimum[longest_rep_len].queue = c->queue;
-                       swap(optimum[longest_rep_len].queue.R[0],
-                            optimum[longest_rep_len].queue.R[longest_rep_slot]);
-                       optimum[longest_rep_len].prev.link = 0;
-                       optimum[longest_rep_len].prev.match_offset =
-                               optimum[longest_rep_len].queue.R[0];
-                       optimum[longest_rep_len].cost = cost;
-               }
+       if (window_ptr == block_end) {
+               c->queue = *begin_queue;
+               return;
        }
 
-       /* Step forward, calculating the estimated minimum cost to reach each
-        * position.  The algorithm may find multiple paths to reach each
-        * position; only the lowest-cost path is saved.
-        *
-        * The progress of the parse is tracked in the @optimum array, which for
-        * each position contains the minimum cost to reach that position, the
-        * index of the start of the match/literal taken to reach that position
-        * through the minimum-cost path, the offset of the match taken (not
-        * relevant for literals), and the adaptive state that will exist at
-        * that position after the minimum-cost path is taken.  The @cur_pos
-        * variable stores the position at which the algorithm is currently
-        * considering coding choices, and the @end_pos variable stores the
-        * greatest position at which the costs of coding choices have been
-        * saved.
-        *
-        * The loop terminates when any one of the following conditions occurs:
-        *
-        * 1. A match with length greater than or equal to @nice_match_length is
-        *    found.  When this occurs, the algorithm chooses this match
-        *    unconditionally, and consequently the near-optimal match/literal
-        *    sequence up to and including that match is fully determined and it
-        *    can begin returning the match/literal list.
-        *
-        * 2. @cur_pos reaches a position not overlapped by a preceding match.
-        *    In such cases, the near-optimal match/literal sequence up to
-        *    @cur_pos is fully determined and it can begin returning the
-        *    match/literal list.
-        *
-        * 3. Failing either of the above in a degenerate case, the loop
-        *    terminates when space in the @optimum array is exhausted.
-        *    This terminates the algorithm and forces it to start returning
-        *    matches/literals even though they may not be globally optimal.
-        *
-        * Upon loop termination, a nonempty list of matches/literals will have
-        * been produced and stored in the @optimum array.  These
-        * matches/literals are linked in reverse order, so the last thing this
-        * function does is reverse this list and return the first
-        * match/literal, leaving the rest to be returned immediately by
-        * subsequent calls to this function.
-        */
-       cur_pos = 0;
+       cur_optimum_ptr = c->optimum;
+       cur_optimum_ptr->cost = 0;
+       cur_optimum_ptr->queue = *begin_queue;
+
+       end_optimum_ptr = cur_optimum_ptr;
+
+       /* The following loop runs once per byte in the window, except in a
+        * couple of shortcut cases.  */
        for (;;) {
-               u32 cost;
-
-               /* Advance to next position.  */
-               cur_pos++;
-
-               /* Check termination conditions (2) and (3) noted above.  */
-               if (cur_pos == end_pos || cur_pos == LZX_OPTIM_ARRAY_LENGTH)
-                       return lzx_match_chooser_reverse_list(c, cur_pos);
-
-               /* Search for matches at repeat offsets.  Again, as a heuristic
-                * we only keep the longest one.  */
-               longest_rep_len = lzx_repsearch(&c->cur_window[c->match_window_pos],
-                                               c->match_window_end - c->match_window_pos,
-                                               &optimum[cur_pos].queue,
-                                               &longest_rep_slot);
-
-               /* If we found a long match at a repeat offset, choose it
-                * immediately.  */
-               if (longest_rep_len >= c->params.nice_match_length) {
-                       /* Build the list of matches to return and get
-                        * the first one.  */
-                       match = lzx_match_chooser_reverse_list(c, cur_pos);
-
-                       /* Append the long match to the end of the list.  */
-                       optimum[cur_pos].next.match_offset =
-                               optimum[cur_pos].queue.R[longest_rep_slot];
-                       optimum[cur_pos].next.link = cur_pos + longest_rep_len;
-                       c->optimum_end_idx = cur_pos + longest_rep_len;
-
-                       /* Skip over the remaining bytes of the long match.  */
-                       lzx_skip_bytes(c, longest_rep_len);
-
-                       /* Return first match in the list.  */
-                       return match;
-               }
 
 
-               /* Find other matches.  */
+               /* Find explicit offset matches with the current position.  */
                num_matches = lzx_get_matches(c, &matches);
 
-               /* If there's a long match, choose it immediately.  */
                if (num_matches) {
+                       /*
+                        * Find the longest repeat offset match with the current
+                        * position.
+                        *
+                        * Heuristics:
+                        *
+                        * - Only search for repeat offset matches if the
+                        *   match-finder already found at least one match.
+                        *
+                        * - Only consider the longest repeat offset match.  It
+                        *   seems to be rare for the optimal parse to include a
+                        *   repeat offset match that doesn't have the longest
+                        *   length (allowing for the possibility that not all
+                        *   of that length is actually used).
+                        */
+                       rep_max_len = lzx_repsearch(window_ptr,
+                                                   block_end - window_ptr,
+                                                   &cur_optimum_ptr->queue,
+                                                   &rep_max_idx);
+
+                       if (rep_max_len) {
+                               /* If there's a very long repeat offset match,
+                                * choose it immediately.  */
+                               if (rep_max_len >= c->params.nice_match_length) {
+
+                                       swap(cur_optimum_ptr->queue.R[0],
+                                            cur_optimum_ptr->queue.R[rep_max_idx]);
+                                       begin_queue = &cur_optimum_ptr->queue;
+
+                                       cur_optimum_ptr += rep_max_len;
+                                       cur_optimum_ptr->mc_item_data =
+                                               (rep_max_idx << MC_OFFSET_SHIFT) |
+                                               rep_max_len;
+
+                                       lzx_skip_bytes(c, rep_max_len - 1);
+                                       break;
+                               }
+
+                               /* If reaching any positions for the first time,
+                                * initialize their costs to "infinity".  */
+                               while (end_optimum_ptr < cur_optimum_ptr + rep_max_len)
+                                       (++end_optimum_ptr)->cost = MC_INFINITE_COST;
+
+                               /* Consider coding a repeat offset match.  */
+                               lzx_consider_repeat_offset_match(c,
+                                                                cur_optimum_ptr,
+                                                                rep_max_len,
+                                                                rep_max_idx);
+                       }
+
                        longest_len = matches[num_matches - 1].len;
+
+                       /* If there's a very long explicit offset match, choose
+                        * it immediately.  */
                        if (longest_len >= c->params.nice_match_length) {
-                               /* Build the list of matches to return and get
-                                * the first one.  */
-                               match = lzx_match_chooser_reverse_list(c, cur_pos);
 
 
-                               /* Append the long match to the end of the list.  */
-                               optimum[cur_pos].next.match_offset =
+                               cur_optimum_ptr->queue.R[2] =
+                                       cur_optimum_ptr->queue.R[1];
+                               cur_optimum_ptr->queue.R[1] =
+                                       cur_optimum_ptr->queue.R[0];
+                               cur_optimum_ptr->queue.R[0] =
                                        matches[num_matches - 1].offset;
-                               optimum[cur_pos].next.link = cur_pos + longest_len;
-                               c->optimum_end_idx = cur_pos + longest_len;
+                               begin_queue = &cur_optimum_ptr->queue;
 
 
-                               /* Skip over the remaining bytes of the long match.  */
-                               lzx_skip_bytes(c, longest_len - 1);
+                               offset_data = matches[num_matches - 1].offset +
+                                             LZX_OFFSET_OFFSET;
+                               cur_optimum_ptr += longest_len;
+                               cur_optimum_ptr->mc_item_data =
+                                       (offset_data << MC_OFFSET_SHIFT) |
+                                       longest_len;
 
 
-                               /* Return first match in the list.  */
-                               return match;
+                               lzx_skip_bytes(c, longest_len - 1);
+                               break;
                        }
+
+                       /* If reaching any positions for the first time,
+                        * initialize their costs to "infinity".  */
+                       while (end_optimum_ptr < cur_optimum_ptr + longest_len)
+                               (++end_optimum_ptr)->cost = MC_INFINITE_COST;
+
+                       /* Consider coding an explicit offset match.  */
+                       lzx_consider_explicit_offset_matches(c, cur_optimum_ptr,
+                                                            matches, num_matches);
                } else {
-                       longest_len = 1;
+                       /* No matches found.  The only choice at this position
+                        * is to code a literal.  */
+
+                       if (end_optimum_ptr == cur_optimum_ptr) {
+                       #if 1
+                               /* Optimization for single literals.  */
+                               if (likely(cur_optimum_ptr == c->optimum)) {
+                                       lzx_declare_literal(c, *window_ptr++,
+                                                           next_chosen_item);
+                                       if (window_ptr == block_end) {
+                                               c->queue = cur_optimum_ptr->queue;
+                                               return;
+                                       }
+                                       continue;
+                               }
+                       #endif
+                               (++end_optimum_ptr)->cost = MC_INFINITE_COST;
+                       }
                }
 
-               /* If we are reaching any positions for the first time, we need
-                * to initialize their costs to infinity.  */
-               while (end_pos < cur_pos + longest_len)
-                       optimum[++end_pos].cost = MC_INFINITE_COST;
-
-               /* Consider coding a literal.  */
-               cost = optimum[cur_pos].cost +
-                       lzx_literal_cost(c->cur_window[c->match_window_pos - 1],
-                                        &c->costs);
-               if (cost < optimum[cur_pos + 1].cost) {
-                       optimum[cur_pos + 1].queue = optimum[cur_pos].queue;
-                       optimum[cur_pos + 1].cost = cost;
-                       optimum[cur_pos + 1].prev.link = cur_pos;
-               }
+               /* Consider coding a literal.
 
 
-               /* Consider coding a match.
-                *
-                * The hard-coded cost calculation is done for the same reason
-                * stated in the comment for the similar loop earlier.
-                * Actually, it is *this* one that has the biggest effect on
-                * performance; overall LZX compression is > 10% faster with
-                * this code compared to calling lzx_match_cost() with each
-                * length.  */
-               for (unsigned i = 0, len = 2; i < num_matches; i++) {
-                       u32 offset;
-                       u32 position_cost;
-                       unsigned position_slot;
-                       unsigned num_extra_bits;
-
-                       offset = matches[i].offset;
-                       position_cost = optimum[cur_pos].cost;
-
-                       /* Yet another optimization: instead of calling
-                        * lzx_get_position_slot(), hand-inline the search of
-                        * the repeat offset queue.  Then we can omit the
-                        * extra_bits calculation for repeat offset matches, and
-                        * also only compute the updated queue if we actually do
-                        * find a new lowest cost path.  */
-                       for (position_slot = 0; position_slot < LZX_NUM_RECENT_OFFSETS; position_slot++)
-                               if (offset == optimum[cur_pos].queue.R[position_slot])
-                                       goto have_position_cost;
-
-                       position_slot = lzx_get_position_slot_raw(offset + LZX_OFFSET_OFFSET);
-
-                       num_extra_bits = lzx_get_num_extra_bits(position_slot);
-                       if (num_extra_bits >= 3) {
-                               position_cost += num_extra_bits - 3;
-                               position_cost += c->costs.aligned[
-                                               (offset + LZX_OFFSET_OFFSET) & 7];
-                       } else {
-                               position_cost += num_extra_bits;
-                       }
+                * To avoid an extra unpredictable branch, the check of whether
+                * coding a literal is preferable is integrated into the queue
+                * update code below.  */
+               literal = *window_ptr++;
+               cost = cur_optimum_ptr->cost + lzx_literal_cost(literal, &c->costs);
 
 
-               have_position_cost:
+               /* Advance to the next position.  */
+               cur_optimum_ptr++;
 
 
-                       do {
-                               u32 cost;
-                               unsigned len_header;
-                               unsigned main_symbol;
-
-                               cost = position_cost;
-
-                               if (len - LZX_MIN_MATCH_LEN < LZX_NUM_PRIMARY_LENS) {
-                                       len_header = len - LZX_MIN_MATCH_LEN;
-                               } else {
-                                       len_header = LZX_NUM_PRIMARY_LENS;
-                                       cost += c->costs.len[len -
-                                                       LZX_MIN_MATCH_LEN -
-                                                       LZX_NUM_PRIMARY_LENS];
-                               }
+               /* The lowest-cost path to the current position is now known.
+                * Finalize the recent offsets queue that results from taking
+                * this lowest-cost path.  */
 
 
-                               main_symbol = ((position_slot << 3) | len_header) +
-                                               LZX_NUM_CHARS;
-                               cost += c->costs.main[main_symbol];
-
-                               if (cost < optimum[cur_pos + len].cost) {
-                                       if (position_slot < LZX_NUM_RECENT_OFFSETS) {
-                                               optimum[cur_pos + len].queue = optimum[cur_pos].queue;
-                                               swap(optimum[cur_pos + len].queue.R[0],
-                                                    optimum[cur_pos + len].queue.R[position_slot]);
-                                       } else {
-                                               optimum[cur_pos + len].queue.R[0] = offset;
-                                               optimum[cur_pos + len].queue.R[1] = optimum[cur_pos].queue.R[0];
-                                               optimum[cur_pos + len].queue.R[2] = optimum[cur_pos].queue.R[1];
-                                       }
-                                       optimum[cur_pos + len].prev.link = cur_pos;
-                                       optimum[cur_pos + len].prev.match_offset = offset;
-                                       optimum[cur_pos + len].cost = cost;
-                               }
-                       } while (++len <= matches[i].len);
+               if (cost < cur_optimum_ptr->cost) {
+                       /* Literal: queue remains unchanged.  */
+                       cur_optimum_ptr->cost = cost;
+                       cur_optimum_ptr->mc_item_data = (literal << MC_OFFSET_SHIFT) | 1;
+                       cur_optimum_ptr->queue = (cur_optimum_ptr - 1)->queue;
+               } else {
+                       /* Match: queue update is needed.  */
+                       len = cur_optimum_ptr->mc_item_data & MC_LEN_MASK;
+                       offset_data = cur_optimum_ptr->mc_item_data >> MC_OFFSET_SHIFT;
+                       if (offset_data >= LZX_NUM_RECENT_OFFSETS) {
+                               /* Explicit offset match: offset is inserted at front  */
+                               cur_optimum_ptr->queue.R[0] = offset_data - LZX_OFFSET_OFFSET;
+                               cur_optimum_ptr->queue.R[1] = (cur_optimum_ptr - len)->queue.R[0];
+                               cur_optimum_ptr->queue.R[2] = (cur_optimum_ptr - len)->queue.R[1];
+                       } else {
+                               /* Repeat offset match: offset is swapped to front  */
+                               cur_optimum_ptr->queue = (cur_optimum_ptr - len)->queue;
+                               swap(cur_optimum_ptr->queue.R[0],
+                                    cur_optimum_ptr->queue.R[offset_data]);
+                       }
                }
 
-               /* Consider coding a repeat offset match.
+               /*
+                * This loop will terminate when either of the following
+                * conditions is true:
+                *
+                * (1) cur_optimum_ptr == end_optimum_ptr
                 *
-                * As a heuristic, we only consider the longest length of the
-                * longest repeat offset match.  This does not, however,
-                * necessarily mean that we will never consider any other repeat
-                * offsets, because above we detect repeat offset matches that
-                * were found by the regular match-finder.  Therefore, this
-                * special handling of the longest repeat-offset match is only
-                * helpful for coding a repeat offset match that was *not* found
-                * by the match-finder, e.g. due to being obscured by a less
-                * distant match that is at least as long.
+                *      There are no paths that extend beyond the current
+                *      position.  In this case, any path to a later position
+                *      must pass through the current position, so we can go
+                *      ahead and choose the list of items that led to this
+                *      position.
                 *
-                * Note: an alternative, used in LZMA, is to consider every
-                * length of every repeat offset match.  This is a more thorough
-                * search, and it makes it unnecessary to detect repeat offset
-                * matches that were found by the regular match-finder.  But by
-                * my tests, for LZX the LZMA method slows down the compressor
-                * by ~10% and doesn't actually help the compression ratio too
-                * much.
+                * (2) cur_optimum_ptr == &c->optimum[LZX_OPTIM_ARRAY_LENGTH]
                 *
-                * Also tested a compromise approach: consider every 3rd length
-                * of the longest repeat offset match.  Still didn't seem quite
-                * worth it, though.
+                *      This bounds the number of times the algorithm can step
+                *      forward before it is guaranteed to start choosing items.
+                *      This limits the memory usage.  But
+                *      LZX_OPTIM_ARRAY_LENGTH is high enough that on most
+                *      inputs this limit is never reached.
+                *
+                * Note: no check for end-of-block is needed because
+                * end-of-block will trigger condition (1).
                 */
-               if (longest_rep_len) {
-
-                       LZX_ASSERT(longest_rep_len >= LZX_MIN_MATCH_LEN);
-
-                       while (end_pos < cur_pos + longest_rep_len)
-                               optimum[++end_pos].cost = MC_INFINITE_COST;
-
-                       cost = optimum[cur_pos].cost +
-                               lzx_repmatch_cost(longest_rep_len, longest_rep_slot,
-                                                 &c->costs);
-                       if (cost <= optimum[cur_pos + longest_rep_len].cost) {
-                               optimum[cur_pos + longest_rep_len].queue =
-                                       optimum[cur_pos].queue;
-                               swap(optimum[cur_pos + longest_rep_len].queue.R[0],
-                                    optimum[cur_pos + longest_rep_len].queue.R[longest_rep_slot]);
-                               optimum[cur_pos + longest_rep_len].prev.link =
-                                       cur_pos;
-                               optimum[cur_pos + longest_rep_len].prev.match_offset =
-                                       optimum[cur_pos + longest_rep_len].queue.R[0];
-                               optimum[cur_pos + longest_rep_len].cost =
-                                       cost;
-                       }
+               if (cur_optimum_ptr == end_optimum_ptr ||
+                   cur_optimum_ptr == &c->optimum[LZX_OPTIM_ARRAY_LENGTH])
+               {
+                       begin_queue = &cur_optimum_ptr->queue;
+                       break;
                }
        }
+
+       /* Choose the current list of items that constitute the minimum-cost
+        * path to the current position.  */
+       lzx_declare_item_list(c, cur_optimum_ptr, next_chosen_item);
+       goto begin;
 }
 
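Stripped of the LZX specifics, lzx_optim_pass() is a forward relaxation
over byte positions followed by a backward walk of the chosen items.  A
toy model of that shape, with one hard-coded match edge and invented
costs (9 bits per literal, 21 bits for a length-5 match at position 0):

    #include <stdint.h>
    #include <stdio.h>

    #define N   8
    #define INF UINT32_MAX

    int main(void)
    {
            uint32_t cost[N + 1];
            unsigned from_len[N + 1];

            cost[0] = 0;
            for (unsigned i = 1; i <= N; i++)
                    cost[i] = INF;

            for (unsigned i = 0; i < N; i++) {
                    /* Literal edge i -> i + 1.  */
                    if (cost[i] + 9 < cost[i + 1]) {
                            cost[i + 1] = cost[i] + 9;
                            from_len[i + 1] = 1;
                    }
                    /* Match edge (only at position 0 in this toy).  */
                    if (i == 0 && cost[0] + 21 < cost[5]) {
                            cost[5] = cost[0] + 21;
                            from_len[5] = 5;
                    }
            }

            /* Walk the chosen path backward, as the item-list
             * declaration step effectively does.  */
            for (unsigned i = N; i != 0; i -= from_len[i])
                    printf("item of length %u ending at %u (cost %u)\n",
                           from_len[i], i, cost[i]);
            return 0;
    }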
-static struct lz_match
-lzx_choose_lazy_item(struct lzx_compressor *c)
+/* Fast heuristic scoring for lazy parsing: how "good" is this match?  */
+static inline unsigned
+lzx_explicit_offset_match_score(unsigned len, u32 adjusted_offset)
 {
-       const struct lz_match *matches;
-       struct lz_match cur_match;
-       struct lz_match next_match;
-       u32 num_matches;
-
-       if (c->prev_match.len) {
-               cur_match = c->prev_match;
-               c->prev_match.len = 0;
-       } else {
-               num_matches = lzx_get_matches(c, &matches);
-               if (num_matches == 0 ||
-                   (matches[num_matches - 1].len <= 3 &&
-                    (matches[num_matches - 1].len <= 2 ||
-                     matches[num_matches - 1].offset > 4096)))
-               {
-                       return (struct lz_match) { };
-               }
-
-               cur_match = matches[num_matches - 1];
-       }
-
-       if (cur_match.len >= c->params.nice_match_length) {
-               lzx_skip_bytes(c, cur_match.len - 1);
-               return cur_match;
-       }
+       unsigned score = len;
 
 
-       num_matches = lzx_get_matches(c, &matches);
-       if (num_matches == 0 ||
-           (matches[num_matches - 1].len <= 3 &&
-            (matches[num_matches - 1].len <= 2 ||
-             matches[num_matches - 1].offset > 4096)))
-       {
-               lzx_skip_bytes(c, cur_match.len - 2);
-               return cur_match;
-       }
+       if (adjusted_offset < 2048)
+               score++;
 
 
-       next_match = matches[num_matches - 1];
+       if (adjusted_offset < 1024)
+               score++;
 
 
-       if (next_match.len <= cur_match.len) {
-               lzx_skip_bytes(c, cur_match.len - 2);
-               return cur_match;
-       } else {
-               c->prev_match = next_match;
-               return (struct lz_match) { };
-       }
+       return score;
 }
 
 }
 
-/*
- * Return the next match or literal to use, delegating to the currently selected
- * match-choosing algorithm.
- *
- * If the length of the returned 'struct lz_match' is less than
- * LZX_MIN_MATCH_LEN, then it is really a literal.
- */
-static inline struct lz_match
-lzx_choose_item(struct lzx_compressor *c)
+static inline unsigned
+lzx_repeat_offset_match_score(unsigned len, unsigned slot)
 {
 {
-       return (*c->params.choose_item_func)(c);
+       return len + 3;
 }
 
 }
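Worked numbers for the two scoring functions above:

    /* len 6 explicit, adjusted offset 5000:  score = 6
     * len 6 explicit, adjusted offset  800:  score = 6 + 1 + 1 = 8
     * len 5 repeat, any queue slot:          score = 5 + 3     = 8  */

The call site below compares with rep_score >= cur_score, so the tie in
the last two rows resolves in favor of the repeat offset match, which is
the cheaper of the two to encode.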
 
-/* Set default symbol costs for the LZX Huffman codes.  */
-static void
-lzx_set_default_costs(struct lzx_costs * costs, unsigned num_main_syms)
+/* Lazy parsing  */
+static u32
+lzx_choose_lazy_items_for_block(struct lzx_compressor *c,
+                               u32 block_start_pos, u32 block_size)
 {
 {
-       unsigned i;
+       const u8 *window_ptr;
+       const u8 *block_end;
+       struct lz_mf *mf;
+       struct lz_match *matches;
+       unsigned num_matches;
+       unsigned cur_len;
+       u32 cur_offset_data;
+       unsigned cur_score;
+       unsigned rep_max_len;
+       unsigned rep_max_idx;
+       unsigned rep_score;
+       unsigned prev_len;
+       unsigned prev_score;
+       u32 prev_offset_data;
+       unsigned skip_len;
+       struct lzx_item *next_chosen_item;
 
 
-       /* Main code (part 1): Literal symbols  */
-       for (i = 0; i < LZX_NUM_CHARS; i++)
-               costs->main[i] = 8;
+       window_ptr = &c->cur_window[block_start_pos];
+       block_end = window_ptr + block_size;
+       matches = c->cached_matches;
+       mf = c->mf;
+       next_chosen_item = c->chosen_items;
 
 
-       /* Main code (part 2): Match header symbols  */
-       for (; i < num_main_syms; i++)
-               costs->main[i] = 10;
+       prev_len = 0;
+       prev_offset_data = 0;
+       prev_score = 0;
 
 
-       /* Length code  */
-       for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++)
-               costs->len[i] = 8;
+       while (window_ptr != block_end) {
 
 
-       /* Aligned offset code  */
-       for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++)
-               costs->aligned[i] = 3;
+               /* Find explicit offset matches with the current position.  */
+               num_matches = lz_mf_get_matches(mf, matches);
+               window_ptr++;
+
+               if (num_matches == 0 ||
+                   (matches[num_matches - 1].len == 3 &&
+                    matches[num_matches - 1].offset >= 8192 - LZX_OFFSET_OFFSET &&
+                    matches[num_matches - 1].offset != c->queue.R[0] &&
+                    matches[num_matches - 1].offset != c->queue.R[1] &&
+                    matches[num_matches - 1].offset != c->queue.R[2]))
+               {
+                       /* No match found, or the only match found was a distant
+                        * length 3 match.  Output the previous match if there
+                        * is one; otherwise output a literal.  */
+
+                       if (prev_len) {
+                               skip_len = prev_len - 2;
+                               goto output_prev_match;
+                       } else {
+                               lzx_declare_literal(c, *(window_ptr - 1),
+                                                   &next_chosen_item);
+                               continue;
+                       }
+               }
+
+               /* Find the longest repeat offset match with the current
+                * position.  */
+               if (likely(block_end - (window_ptr - 1) >= 2)) {
+                       rep_max_len = lzx_repsearch((window_ptr - 1),
+                                                   block_end - (window_ptr - 1),
+                                                   &c->queue, &rep_max_idx);
+               } else {
+                       rep_max_len = 0;
+               }
+
+               cur_len = matches[num_matches - 1].len;
+               cur_offset_data = matches[num_matches - 1].offset + LZX_OFFSET_OFFSET;
+               cur_score = lzx_explicit_offset_match_score(cur_len, cur_offset_data);
+
+               /* Select the better of the explicit and repeat offset matches.  */
+               if (rep_max_len >= 3 &&
+                   (rep_score = lzx_repeat_offset_match_score(rep_max_len,
+                                                              rep_max_idx)) >= cur_score)
+               {
+                       cur_len = rep_max_len;
+                       cur_offset_data = rep_max_idx;
+                       cur_score = rep_score;
+               }
+
+               if (unlikely(cur_len > block_end - (window_ptr - 1))) {
+                       /* Nearing end of block.  */
+                       cur_len = block_end - (window_ptr - 1);
+                       if (cur_len < 3) {
+                               lzx_declare_literal(c, *(window_ptr - 1), &next_chosen_item);
+                               prev_len = 0;
+                               continue;
+                       }
+               }
+
+               if (prev_len == 0 || cur_score > prev_score) {
+                       /* No previous match, or the current match is better
+                        * than the previous match.
+                        *
+                        * If there's a previous match, then output a literal in
+                        * its place.
+                        *
+                        * In both cases, if the current match is very long,
+                        * then output it immediately.  Otherwise, attempt a
+                        * lazy match by waiting to see if there's a better
+                        * match at the next position.  */
+
+                       if (prev_len)
+                               lzx_declare_literal(c, *(window_ptr - 2), &next_chosen_item);
+
+                       prev_len = cur_len;
+                       prev_offset_data = cur_offset_data;
+                       prev_score = cur_score;
+
+                       if (prev_len >= c->params.nice_match_length) {
+                               skip_len = prev_len - 1;
+                               goto output_prev_match;
+                       }
+                       continue;
+               }
+
+               /* Current match is not better than the previous match, so
+                * output the previous match.  */
+
+               skip_len = prev_len - 2;
+
+       output_prev_match:
+               if (prev_offset_data < LZX_NUM_RECENT_OFFSETS) {
+                       lzx_declare_repeat_offset_match(c, prev_len,
+                                                       prev_offset_data,
+                                                       &next_chosen_item);
+                       swap(c->queue.R[0], c->queue.R[prev_offset_data]);
+               } else {
+                       lzx_declare_explicit_offset_match(c, prev_len,
+                                                         prev_offset_data - LZX_OFFSET_OFFSET,
+                                                         &next_chosen_item);
+                       c->queue.R[2] = c->queue.R[1];
+                       c->queue.R[1] = c->queue.R[0];
+                       c->queue.R[0] = prev_offset_data - LZX_OFFSET_OFFSET;
+               }
+               lz_mf_skip_positions(mf, skip_len);
+               window_ptr += skip_len;
+               prev_len = 0;
+       }
+
+       return next_chosen_item - c->chosen_items;
 }
 
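The control flow above is the classic one-position-lookahead lazy parse:
hold a candidate match, peek at the next position, and either emit a
literal and adopt the better new match, or commit the held match and skip
the positions it covers.  The skeleton, with hypothetical scores standing
in for the match-finder and scoring heuristics:

    #include <stdio.h>

    struct cand { unsigned len, score; };

    int main(void)
    {
            /* Hypothetical best candidate per position (len 0 = none). */
            const struct cand in[] = { { 4, 5 }, { 6, 8 }, { 0, 0 } };
            struct cand prev = { 0, 0 };

            for (unsigned i = 0; i < 3; i++) {
                    struct cand cur = in[i];

                    if (cur.len && (prev.len == 0 || cur.score > prev.score)) {
                            if (prev.len)
                                    printf("literal (in place of held match)\n");
                            prev = cur;  /* hold it; maybe better next time */
                    } else if (prev.len) {
                            printf("match of length %u\n", prev.len);
                            /* the real parser now skips prev.len - 2 bytes */
                            prev.len = 0;
                    } else {
                            printf("literal\n");
                    }
            }
            return 0;
    }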
 /* Given the frequencies of symbols in an LZX-compressed block and the
@@ -2014,204 +1896,229 @@ lzx_choose_verbatim_or_aligned(const struct lzx_freqs * freqs,
        unsigned aligned_cost = 0;
        unsigned verbatim_cost = 0;
 
-       /* Verbatim blocks have a constant 3 bits per position footer.  Aligned
-        * offset blocks have an aligned offset symbol per position footer, plus
-        * an extra 24 bits per block to output the lengths necessary to
-        * reconstruct the aligned offset code itself.  */
+       /* A verbatim block requires 3 bits in each place that an aligned
+        * offset symbol was used.  */
        for (unsigned i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
                verbatim_cost += 3 * freqs->aligned[i];
                aligned_cost += codes->lens.aligned[i] * freqs->aligned[i];
        }
+
+       /* Account for output of the aligned offset code.  */
        aligned_cost += LZX_ALIGNEDCODE_ELEMENT_SIZE * LZX_ALIGNEDCODE_NUM_SYMBOLS;
+
        if (aligned_cost < verbatim_cost)
                return LZX_BLOCKTYPE_ALIGNED;
        else
                return LZX_BLOCKTYPE_VERBATIM;
 }
 
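The comparison reduces to simple arithmetic.  With 1000 aligned-symbol
occurrences and the (hypothetical) Huffman code lengths below, the
aligned block type wins: 2600 + 24 bits of code output versus 3000 bits
of raw 3-bit fields:

    #include <stdio.h>

    #define NUM_SYMS     8  /* LZX_ALIGNEDCODE_NUM_SYMBOLS */
    #define ELEMENT_SIZE 3  /* LZX_ALIGNEDCODE_ELEMENT_SIZE */

    int main(void)
    {
            /* Hypothetical frequencies and Huffman code lengths.  */
            const unsigned freq[NUM_SYMS] = { 500, 100, 100, 100,
                                              50, 50, 50, 50 };
            const unsigned lens[NUM_SYMS] = { 2, 2, 3, 3, 4, 4, 4, 4 };
            unsigned verbatim = 0, aligned = 0;

            for (unsigned i = 0; i < NUM_SYMS; i++) {
                    verbatim += 3 * freq[i];       /* raw 3-bit fields   */
                    aligned += lens[i] * freq[i];  /* Huffman-coded bits */
            }
            aligned += ELEMENT_SIZE * NUM_SYMS;    /* 24-bit code header */

            printf("verbatim %u bits, aligned %u bits -> %s\n", verbatim,
                   aligned, aligned < verbatim ? "ALIGNED" : "VERBATIM");
            return 0;
    }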
-/* Find a sequence of matches/literals with which to output the specified LZX
- * block, then set the block's type to that which has the minimum cost to output
- * (either verbatim or aligned).  */
-static void
-lzx_choose_items_for_block(struct lzx_compressor *c, struct lzx_block_spec *spec)
+/* Near-optimal parsing  */
+static u32
+lzx_choose_near_optimal_items_for_block(struct lzx_compressor *c,
+                                       u32 block_start_pos, u32 block_size)
 {
-       const struct lzx_lru_queue orig_queue = c->queue;
        u32 num_passes_remaining = c->params.num_optim_passes;
-       struct lzx_freqs freqs;
-       const u8 *window_ptr;
-       const u8 *window_end;
+       struct lzx_lru_queue orig_queue;
        struct lzx_item *next_chosen_item;
-       struct lz_match lz_match;
-       struct lzx_item lzx_item;
-
-       LZX_ASSERT(num_passes_remaining >= 1);
-       LZX_ASSERT(lz_mf_get_position(c->mf) == spec->window_pos);
-
-       c->match_window_end = spec->window_pos + spec->block_size;
+       struct lzx_item **next_chosen_item_ptr;
 
 
-       if (c->params.num_optim_passes > 1) {
-               if (spec->block_size == c->cur_window_size)
+       /* Choose appropriate match-finder wrapper functions.  */
+       if (num_passes_remaining > 1) {
+               if (block_size == c->cur_window_size)
                        c->get_matches_func = lzx_get_matches_fillcache_singleblock;
                else
                        c->get_matches_func = lzx_get_matches_fillcache_multiblock;
                c->skip_bytes_func = lzx_skip_bytes_fillcache;
        } else {
-               if (spec->block_size == c->cur_window_size)
+               if (block_size == c->cur_window_size)
                        c->get_matches_func = lzx_get_matches_nocache_singleblock;
                else
                        c->get_matches_func = lzx_get_matches_nocache_multiblock;
                c->skip_bytes_func = lzx_skip_bytes_nocache;
        }
 
-       /* The first optimal parsing pass is done using the cost model already
-        * set in c->costs.  Each later pass is done using a cost model
-        * computed from the previous pass.
+       /* No matches will extend beyond the end of the block.  */
+       c->match_window_end = block_start_pos + block_size;
+
+       /* The first optimization pass will use a default cost model.  Each
+        * additional optimization pass will use a cost model computed from the
+        * previous pass.
         *
         * To improve performance we only generate the array containing the
-        * matches and literals in intermediate form on the final pass.  */
+        * matches and literals in intermediate form on the final pass.  For
+        * earlier passes, tallying symbol frequencies is sufficient.  */
+       lzx_set_default_costs(&c->costs, c->num_main_syms);
 
-       while (--num_passes_remaining) {
-               c->match_window_pos = spec->window_pos;
+       next_chosen_item_ptr = NULL;
+       orig_queue = c->queue;
+       do {
+               /* Reset the match-finder wrapper.  */
+               c->match_window_pos = block_start_pos;
                c->cache_ptr = c->cached_matches;
-               memset(&freqs, 0, sizeof(freqs));
-               window_ptr = &c->cur_window[spec->window_pos];
-               window_end = window_ptr + spec->block_size;
 
-               while (window_ptr != window_end) {
+               if (num_passes_remaining == 1) {
+                       /* Last pass: actually generate the items.  */
+                       next_chosen_item = c->chosen_items;
+                       next_chosen_item_ptr = &next_chosen_item;
+               }
 
-                       lz_match = lzx_choose_item(c);
+               /* Choose the items.  */
+               lzx_optim_pass(c, next_chosen_item_ptr);
 
-                       LZX_ASSERT(!(lz_match.len == LZX_MIN_MATCH_LEN &&
-                                    lz_match.offset == c->max_window_size -
-                                                        LZX_MIN_MATCH_LEN));
-                       if (lz_match.len >= LZX_MIN_MATCH_LEN) {
-                               lzx_tally_match(lz_match.len, lz_match.offset,
-                                               &freqs, &c->queue);
-                               window_ptr += lz_match.len;
-                       } else {
-                               lzx_tally_literal(*window_ptr, &freqs);
-                               window_ptr += 1;
-                       }
-               }
-               lzx_make_huffman_codes(&freqs, &spec->codes, c->num_main_syms);
-               lzx_set_costs(c, &spec->codes.lens, 15);
-               c->queue = orig_queue;
-               if (c->cache_ptr <= c->cache_limit) {
-                       c->get_matches_func = lzx_get_matches_usecache_nocheck;
-                       c->skip_bytes_func = lzx_skip_bytes_usecache_nocheck;
-               } else {
-                       c->get_matches_func = lzx_get_matches_usecache;
-                       c->skip_bytes_func = lzx_skip_bytes_usecache;
-               }
-       }
+               if (num_passes_remaining > 1) {
+                       /* This isn't the last pass.  */
 
-       c->match_window_pos = spec->window_pos;
-       c->cache_ptr = c->cached_matches;
-       memset(&freqs, 0, sizeof(freqs));
-       window_ptr = &c->cur_window[spec->window_pos];
-       window_end = window_ptr + spec->block_size;
+                       /* Make the Huffman codes from the symbol frequencies.  */
+                       lzx_make_huffman_codes(&c->freqs, &c->codes[c->codes_index],
+                                              c->num_main_syms);
 
-       spec->chosen_items = &c->chosen_items[spec->window_pos];
-       next_chosen_item = spec->chosen_items;
+                       /* Update symbol costs.  */
+                       lzx_set_costs(c, &c->codes[c->codes_index].lens);
 
-       unsigned unseen_cost = 9;
-       while (window_ptr != window_end) {
+                       /* Reset symbol frequencies.  */
+                       memset(&c->freqs, 0, sizeof(c->freqs));
 
-               lz_match = lzx_choose_item(c);
+                       /* Reset the match offset LRU queue to what it was at
+                        * the beginning of the block.  */
+                       c->queue = orig_queue;
 
-               LZX_ASSERT(!(lz_match.len == LZX_MIN_MATCH_LEN &&
-                            lz_match.offset == c->max_window_size -
-                                                LZX_MIN_MATCH_LEN));
-               if (lz_match.len >= LZX_MIN_MATCH_LEN) {
-                       lzx_item.data = lzx_tally_match(lz_match.len,
-                                                        lz_match.offset,
-                                                        &freqs, &c->queue);
-                       window_ptr += lz_match.len;
-               } else {
-                       lzx_item.data = lzx_tally_literal(*window_ptr, &freqs);
-                       window_ptr += 1;
+                       /* Choose appropriate match-finder wrapper functions.  */
+                       if (c->cache_ptr <= c->cache_limit) {
+                               c->get_matches_func = lzx_get_matches_usecache_nocheck;
+                               c->skip_bytes_func = lzx_skip_bytes_usecache_nocheck;
+                       } else {
+                               c->get_matches_func = lzx_get_matches_usecache;
+                               c->skip_bytes_func = lzx_skip_bytes_usecache;
+                       }
                }
-               *next_chosen_item++ = lzx_item;
+       } while (--num_passes_remaining);
 
-               /* When doing one-pass "near-optimal" parsing, update the cost
-                * model occassionally.  */
-               if (unlikely((next_chosen_item - spec->chosen_items) % 2048 == 0) &&
-                   c->params.choose_item_func == lzx_choose_near_optimal_item &&
-                   c->params.num_optim_passes == 1)
-               {
-                       lzx_make_huffman_codes(&freqs, &spec->codes, c->num_main_syms);
-                       lzx_set_costs(c, &spec->codes.lens, unseen_cost);
-                       if (unseen_cost < 15)
-                               unseen_cost++;
-               }
-       }
-       spec->num_chosen_items = next_chosen_item - spec->chosen_items;
-       lzx_make_huffman_codes(&freqs, &spec->codes, c->num_main_syms);
-       spec->block_type = lzx_choose_verbatim_or_aligned(&freqs, &spec->codes);
+       /* Return the number of items chosen.  */
+       return next_chosen_item - c->chosen_items;
+}
+
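In short, each pass replays the same block: passes before the last run with next_chosen_item_ptr == NULL and only tally c->freqs, and the Huffman code built from those frequencies becomes the refined cost model for the next pass. For example, with num_optim_passes == 3 the schedule works out to roughly:

	pass 1: default costs (lzx_set_default_costs)  -> tally c->freqs only
	pass 2: costs from the pass 1 code             -> tally c->freqs only
	pass 3: costs from the pass 2 code             -> record c->chosen_items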
+/*
+ * Choose the matches/literals with which to output the block of data beginning
+ * at '&c->cur_window[block_start_pos]' and extending for 'block_size' bytes.
+ *
+ * The frequencies of the Huffman symbols in the block will be tallied in
+ * 'c->freqs'.
+ *
+ * 'c->queue' must specify the state of the queue at the beginning of this block.
+ * This function will update it to the state of the queue at the end of this
+ * block.
+ *
+ * Returns the number of matches/literals that were chosen and written to
+ * 'c->chosen_items' in the 'struct lzx_item' intermediate representation.
+ */
+static u32
+lzx_choose_items_for_block(struct lzx_compressor *c,
+                          u32 block_start_pos, u32 block_size)
+{
+       return (*c->params.choose_items_for_block)(c, block_start_pos, block_size);
 }
 
-/* Prepare the input window into one or more LZX blocks ready to be output.  */
+/* Initialize c->offset_slot_fast.  */
 static void
-lzx_prepare_blocks(struct lzx_compressor *c)
+lzx_init_offset_slot_fast(struct lzx_compressor *c)
 {
-       /* Set up a default cost model.  */
-       if (c->params.choose_item_func == lzx_choose_near_optimal_item)
-               lzx_set_default_costs(&c->costs, c->num_main_syms);
+       u8 slot = 0;
 
-       /* Set up the block specifications.
-        * TODO: The compression ratio could be slightly improved by performing
-        * data-dependent block splitting instead of using fixed-size blocks.
-        * Doing so well is a computationally hard problem, however.  */
-       c->num_blocks = DIV_ROUND_UP(c->cur_window_size, LZX_DIV_BLOCK_SIZE);
-       for (unsigned i = 0; i < c->num_blocks; i++) {
-               u32 pos = LZX_DIV_BLOCK_SIZE * i;
-               c->block_specs[i].window_pos = pos;
-               c->block_specs[i].block_size = min(c->cur_window_size - pos,
-                                                  LZX_DIV_BLOCK_SIZE);
-       }
+       for (u32 offset = 0; offset < LZX_NUM_FAST_OFFSETS; offset++) {
 
-       /* Load the window into the match-finder.  */
-       lz_mf_load_window(c->mf, c->cur_window, c->cur_window_size);
+               while (offset + LZX_OFFSET_OFFSET >= lzx_offset_slot_base[slot + 1])
+                       slot++;
 
-       /* Determine sequence of matches/literals to output for each block.  */
-       lzx_lru_queue_init(&c->queue);
-       c->optimum_cur_idx = 0;
-       c->optimum_end_idx = 0;
-       c->prev_match.len = 0;
-       for (unsigned i = 0; i < c->num_blocks; i++)
-               lzx_choose_items_for_block(c, &c->block_specs[i]);
+               c->offset_slot_fast[offset] = slot;
+       }
 }
 
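The table built above makes the offset-to-slot mapping a single array access for small offsets. A lookup helper would look roughly like the following sketch (the slow-path name lzx_get_offset_slot_raw() is an assumption here, since that helper is not shown in this patch):

	static inline unsigned
	lzx_get_offset_slot(const struct lzx_compressor *c, u32 offset)
	{
		if (offset < LZX_NUM_FAST_OFFSETS)
			return c->offset_slot_fast[offset];
		/* Rare case: derive the slot from the formatted offset.  */
		return lzx_get_offset_slot_raw(offset + LZX_OFFSET_OFFSET);
	}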
+/* Set internal compression parameters for the specified compression level and
+ * maximum window size.  */
 static void
-lzx_build_params(unsigned int compression_level,
-                u32 max_window_size,
+lzx_build_params(unsigned int compression_level, u32 max_window_size,
                 struct lzx_compressor_params *lzx_params)
 {
        if (compression_level < 25) {
-               lzx_params->choose_item_func = lzx_choose_lazy_item;
-               lzx_params->num_optim_passes  = 1;
+
+               /* Fast compression: Use lazy parsing.  */
+
+               lzx_params->choose_items_for_block = lzx_choose_lazy_items_for_block;
+               lzx_params->num_optim_passes = 1;
+
+               /* When lazy parsing, the hash chain match-finding algorithm is
+                * fastest unless the window is too large.
+                *
+                * TODO: something like hash arrays would actually be better
+                * than binary trees on large windows.  */
                if (max_window_size <= 262144)
                        lzx_params->mf_algo = LZ_MF_HASH_CHAINS;
                else
                        lzx_params->mf_algo = LZ_MF_BINARY_TREES;
-               lzx_params->min_match_length  = 3;
+
+               /* When lazy parsing, don't bother with length 2 matches.  */
+               lzx_params->min_match_length = 3;
+
+               /* Scale nice_match_length and max_search_depth with the
+                * compression level.  */
                lzx_params->nice_match_length = 25 + compression_level * 2;
-               lzx_params->max_search_depth  = 25 + compression_level;
+               lzx_params->max_search_depth = 25 + compression_level;
        } else {
-               lzx_params->choose_item_func = lzx_choose_near_optimal_item;
-               lzx_params->num_optim_passes  = compression_level / 20;
+
+               /* Normal / high compression: Use near-optimal parsing.  */
+
+               lzx_params->choose_items_for_block = lzx_choose_near_optimal_items_for_block;
+
+               /* Set a number of optimization passes appropriate for the
+                * compression level.  */
+
+               lzx_params->num_optim_passes = 1;
+
+               if (compression_level >= 40)
+                       lzx_params->num_optim_passes++;
+
+               /* Use more optimization passes for higher compression levels.
+                * But the more passes there are, the less they help --- so
+                * don't add them linearly.  */
+               if (compression_level >= 70) {
+                       lzx_params->num_optim_passes++;
+                       if (compression_level >= 100)
+                               lzx_params->num_optim_passes++;
+                       if (compression_level >= 150)
+                               lzx_params->num_optim_passes++;
+                       if (compression_level >= 200)
+                               lzx_params->num_optim_passes++;
+                       if (compression_level >= 300)
+                               lzx_params->num_optim_passes++;
+               }
+
+               /* When doing near-optimal parsing, the hash chain match-finding
+                * algorithm is good if the window size is small and we're only
+                * doing one optimization pass.  Otherwise, the binary tree
+                * algorithm is the way to go.  */
                if (max_window_size <= 32768 && lzx_params->num_optim_passes == 1)
                        lzx_params->mf_algo = LZ_MF_HASH_CHAINS;
                else
                        lzx_params->mf_algo = LZ_MF_BINARY_TREES;
-               lzx_params->min_match_length  = (compression_level >= 45) ? 2 : 3;
+
+               /* When doing near-optimal parsing, allow length 2 matches if
+                * the compression level is sufficiently high.  */
+               if (compression_level >= 45)
+                       lzx_params->min_match_length = 2;
+               else
+                       lzx_params->min_match_length = 3;
+
+               /* Scale nice_match_length and max_search_depth with the
+                * compression level.  */
                lzx_params->nice_match_length = min(((u64)compression_level * 32) / 50,
                                                    LZX_MAX_MATCH_LEN);
-               lzx_params->max_search_depth  = min(((u64)compression_level * 50) / 50,
-                                                   LZX_MAX_MATCH_LEN);
+               lzx_params->max_search_depth = min(((u64)compression_level * 50) / 50,
+                                                  LZX_MAX_MATCH_LEN);
        }
 }
 
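Worked examples of the mapping above: level 20 selects lazy parsing with min_match_length 3, nice_match_length 65, and max_search_depth 45; level 50 selects near-optimal parsing with 2 optimization passes, min_match_length 2, nice_match_length 32, and max_search_depth 50; level 100 gets 4 passes, nice_match_length 64, and max_search_depth 100.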
+/* Given the internal compression parameters and maximum window size, build the
+ * Lempel-Ziv match-finder parameters.  */
 static void
 lzx_build_mf_params(const struct lzx_compressor_params *lzx_params,
                    u32 max_window_size, struct lz_mf_params *mf_params)
@@ -2246,18 +2153,13 @@ lzx_get_needed_memory(size_t max_block_size, unsigned int compression_level)
 
        size += sizeof(struct lzx_compressor);
 
+       /* cur_window */
        size += max_window_size;
 
-       size += DIV_ROUND_UP(max_window_size, LZX_DIV_BLOCK_SIZE) *
-               sizeof(struct lzx_block_spec);
-
-       size += max_window_size * sizeof(struct lzx_item);
-
+       /* mf */
        size += lz_mf_get_needed_memory(params.mf_algo, max_window_size);
-       if (params.choose_item_func == lzx_choose_near_optimal_item) {
-               size += (LZX_OPTIM_ARRAY_LENGTH + params.nice_match_length) *
-                       sizeof(struct lzx_mc_pos_data);
-       }
+
+       /* cached_matches */
        if (params.num_optim_passes > 1)
                size += LZX_CACHE_LEN * sizeof(struct lz_match);
        else
@@ -2291,35 +2193,18 @@ lzx_create_compressor(size_t max_block_size, unsigned int compression_level,
 
        c->params = params;
        c->num_main_syms = lzx_get_num_main_syms(window_order);
-       c->max_window_size = max_window_size;
        c->window_order = window_order;
 
+       /* The window is allocated as 16-byte aligned to speed up memcpy() and
+        * enable lzx_e8_filter() optimization on x86_64.  */
        c->cur_window = ALIGNED_MALLOC(max_window_size, 16);
        if (!c->cur_window)
                goto oom;
 
-       c->block_specs = MALLOC(DIV_ROUND_UP(max_window_size,
-                                            LZX_DIV_BLOCK_SIZE) *
-                               sizeof(struct lzx_block_spec));
-       if (!c->block_specs)
-               goto oom;
-
-       c->chosen_items = MALLOC(max_window_size * sizeof(struct lzx_item));
-       if (!c->chosen_items)
-               goto oom;
-
        c->mf = lz_mf_alloc(&mf_params);
        if (!c->mf)
                goto oom;
 
-       if (params.choose_item_func == lzx_choose_near_optimal_item) {
-               c->optimum = MALLOC((LZX_OPTIM_ARRAY_LENGTH +
-                                    params.nice_match_length) *
-                                   sizeof(struct lzx_mc_pos_data));
-               if (!c->optimum)
-                       goto oom;
-       }
-
        if (params.num_optim_passes > 1) {
                c->cached_matches = MALLOC(LZX_CACHE_LEN *
                                           sizeof(struct lz_match));
@@ -2334,6 +2219,8 @@ lzx_create_compressor(size_t max_block_size, unsigned int compression_level,
                        goto oom;
        }
 
+       lzx_init_offset_slot_fast(c);
+
        *c_ret = c;
        return 0;
 
@@ -2348,26 +2235,83 @@ lzx_compress(const void *uncompressed_data, size_t uncompressed_size,
 {
        struct lzx_compressor *c = _c;
        struct lzx_output_bitstream os;
+       u32 num_chosen_items;
+       const struct lzx_lens *prev_lens;
+       u32 block_start_pos;
+       u32 block_size;
+       int block_type;
 
        /* Don't bother compressing very small inputs.  */
        if (uncompressed_size < 100)
                return 0;
 
        /* The input data must be preprocessed.  To avoid changing the original
-        * input, copy it to a temporary buffer.  */
+        * input data, copy it to a temporary buffer.  */
        memcpy(c->cur_window, uncompressed_data, uncompressed_size);
        c->cur_window_size = uncompressed_size;
 
        /* Preprocess the data.  */
        lzx_do_e8_preprocessing(c->cur_window, c->cur_window_size);
 
-       /* Prepare the compressed data.  */
-       lzx_prepare_blocks(c);
+       /* Load the window into the match-finder.  */
+       lz_mf_load_window(c->mf, c->cur_window, c->cur_window_size);
+
+       /* Initialize the match offset LRU queue.  */
+       lzx_lru_queue_init(&c->queue);
 
-       /* Generate the compressed data and return its size, or 0 if an overflow
-        * occurred.  */
+       /* Initialize the output bitstream.  */
        lzx_init_output(&os, compressed_data, compressed_size_avail);
-       lzx_write_all_blocks(c, &os);
+
+       /* Compress the data block by block.
+        *
+        * TODO: The compression ratio could be slightly improved by performing
+        * data-dependent block splitting instead of using fixed-size blocks.
+        * Doing so well is a computationally hard problem, however.  */
+       block_start_pos = 0;
+       c->codes_index = 0;
+       prev_lens = &c->zero_lens;
+       do {
+               /* Compute the block size.  */
+               block_size = min(LZX_DIV_BLOCK_SIZE,
+                                uncompressed_size - block_start_pos);
+
+               /* Reset symbol frequencies.  */
+               memset(&c->freqs, 0, sizeof(c->freqs));
+
+               /* Prepare the matches/literals for the block.  */
+               num_chosen_items = lzx_choose_items_for_block(c,
+                                                             block_start_pos,
+                                                             block_size);
+
+               /* Make the Huffman codes from the symbol frequencies.  */
+               lzx_make_huffman_codes(&c->freqs, &c->codes[c->codes_index],
+                                      c->num_main_syms);
+
+               /* Choose the best block type.
+                *
+                * Note: we currently don't consider uncompressed blocks.  */
+               block_type = lzx_choose_verbatim_or_aligned(&c->freqs,
+                                                           &c->codes[c->codes_index]);
+
+               /* Write the compressed block to the output buffer.  */
+               lzx_write_compressed_block(block_type,
+                                          block_size,
+                                          c->window_order,
+                                          c->num_main_syms,
+                                          c->chosen_items,
+                                          num_chosen_items,
+                                          &c->codes[c->codes_index],
+                                          prev_lens,
+                                          &os);
+
+               /* The current codeword lengths become the previous lengths.  */
+               prev_lens = &c->codes[c->codes_index].lens;
+               c->codes_index ^= 1;
+
+               block_start_pos += block_size;
+
+       } while (block_start_pos != uncompressed_size);
+
        return lzx_flush_output(&os);
 }
 
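Note that c->codes[] only needs two entries: each block's Huffman codeword lengths are transmitted as a delta against the previous block's lengths (the prev_lens argument, initially the all-zero c->zero_lens), so the lengths just written must stay live while the next block's code is built. Toggling c->codes_index with the XOR alternates between the two slots without copying.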
@@ -2378,10 +2322,7 @@ lzx_free_compressor(void *_c)
 
        if (c) {
                ALIGNED_FREE(c->cur_window);
-               FREE(c->block_specs);
-               FREE(c->chosen_items);
                lz_mf_free(c->mf);
-               FREE(c->optimum);
                FREE(c->cached_matches);
                FREE(c);
        }
diff --git a/src/lzx-decompress.c b/src/lzx-decompress.c
index 58d1f6b1e75fdcca283a0863534462312a52e44e..411a3a0436fd99e2bb05818c10a677b800df2e32 100644 (file)
--- a/src/lzx-decompress.c
+++ b/src/lzx-decompress.c
@@ -427,7 +427,7 @@ lzx_decompress_block(int block_type, u32 block_size,
        u8 *window_end = window_ptr + block_size;
        unsigned mainsym;
        u32 match_len;
-       unsigned position_slot;
+       unsigned offset_slot;
        u32 match_offset;
        unsigned num_extra_bits;
        unsigned ones_if_aligned = 0U - (block_type == LZX_BLOCKTYPE_ALIGNED);
@@ -443,35 +443,35 @@ lzx_decompress_block(int block_type, u32 block_size,
 
                /* Match  */
 
-               /* Decode the length header and position slot.  */
+               /* Decode the length header and offset slot.  */
                mainsym -= LZX_NUM_CHARS;
                match_len = mainsym & 0x7;
-               position_slot = mainsym >> 3;
+               offset_slot = mainsym >> 3;
 
                /* If needed, read a length symbol to decode the full length. */
                if (match_len == 0x7)
                        match_len += read_huffsym_using_lencode(istream, tables);
                match_len += LZX_MIN_MATCH_LEN;
 
-               if (position_slot <= 2) {
+               if (offset_slot <= 2) {
                        /* Repeat offset  */
 
                        /* Note: This isn't a real LRU queue, since using the R2
                         * offset doesn't bump the R1 offset down to R2.  This
                         * quirk allows all 3 recent offsets to be handled by
                         * the same code.  (For R0, the swap is a no-op.)  */
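                        /* For example, if {R0, R1, R2} = {8, 16, 32} and a
                         * match selects R2, then match_offset = 32 and the
                         * queue becomes {32, 16, 8}; a strict LRU update
                         * would instead give {32, 8, 16}.  */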
-                       match_offset = queue->R[position_slot];
-                       queue->R[position_slot] = queue->R[0];
+                       match_offset = queue->R[offset_slot];
+                       queue->R[offset_slot] = queue->R[0];
                        queue->R[0] = match_offset;
                } else {
                        /* Explicit offset  */
 
                        /* Look up the number of extra bits that need to be read
-                        * to decode offsets with this position slot.  */
-                       num_extra_bits = lzx_get_num_extra_bits(position_slot);
+                        * to decode offsets with this offset slot.  */
+                       num_extra_bits = lzx_extra_offset_bits[offset_slot];
 
-                       /* Start with the position slot base value.  */
-                       match_offset = lzx_position_base[position_slot];
+                       /* Start with the offset slot base value.  */
+                       match_offset = lzx_offset_slot_base[offset_slot];
 
                        /* In aligned offset blocks, the low-order 3 bits of
                         * each offset are encoded using the aligned offset
diff --git a/src/xpress-compress.c b/src/xpress-compress.c
index 4b4e74d8cabe7f92ec6f12e5af28b276c70f04bd..f39d54758969f2884f37ee2e8d50b3164152b7c4 100644 (file)
--- a/src/xpress-compress.c
+++ b/src/xpress-compress.c
@@ -28,8 +28,8 @@
 #  include "config.h"
 #endif
 
-#include "wimlib/compressor_ops.h"
 #include "wimlib/compress_common.h"
 #include "wimlib/compress_common.h"
+#include "wimlib/compressor_ops.h"
 #include "wimlib/endianness.h"
 #include "wimlib/error.h"
 #include "wimlib/lz_mf.h"
 #include "wimlib/endianness.h"
 #include "wimlib/error.h"
 #include "wimlib/lz_mf.h"
@@ -37,6 +37,7 @@
 #include "wimlib/xpress.h"
 
 #include <string.h>
 #include "wimlib/xpress.h"
 
 #include <string.h>
+#include <limits.h>
 
 #define XPRESS_CACHE_PER_POS           8
 #define XPRESS_OPTIM_ARRAY_LENGTH      4096
@@ -45,21 +46,14 @@ struct xpress_compressor;
 struct xpress_item;
 struct xpress_mc_pos_data;
 
+/* Internal compression parameters  */
 struct xpress_compressor_params {
 
-       /* Only used when choose_items_func == xpress_choose_items_near_optimal  */
-       u32 num_optim_passes;
+       /* See xpress_choose_items()  */
+       u32 (*choose_items_func)(struct xpress_compressor *);
 
-       /* Given the data to compress (c->cur_window, c->cur_window_size),
-        * 'choose_items_func' fills in c->chosen_items with the intermediate
-        * representation of the match/literal sequence to output.  Also fills
-        * in c->codewords and c->lens to provide the Huffman code with which
-        * these items should be output.
-        *
-        * Returns the number of items written to c->chosen_items.  This can be
-        * at most c->cur_window_size.  (The worst case is all literals, no
-        * matches.)  */
-       u32 (*choose_items_func)(struct xpress_compressor *c);
+       /* For near-optimal parsing only  */
+       u32 num_optim_passes;
 
        /* Match-finding algorithm and parameters  */
        enum lz_mf_algo mf_algo;
@@ -67,35 +61,32 @@ struct xpress_compressor_params {
        u32 nice_match_length;
 };
 
-/* XPRESS compressor state.  */
+/* State of the XPRESS compressor  */
 struct xpress_compressor {
 
-       /* Parameters determined based on the compression level.  */
+       /* Internal compression parameters  */
        struct xpress_compressor_params params;
 
+       /* Data currently being compressed  */
+       const u8 *cur_window;
+       u32 cur_window_size;
+
        /* Lempel-Ziv match-finder  */
        struct lz_mf *mf;
 
        /* Optimal parsing data  */
        unsigned (*get_matches_func)(struct xpress_compressor *,
                                     const struct lz_match **);
-       void (*skip_bytes_func)(struct xpress_compressor *, u32 n);
-       const u8 *cur_window_ptr;
+       void (*skip_bytes_func)(struct xpress_compressor *, unsigned n);
        struct lz_match *cached_matches;
        struct lz_match *cache_ptr;
        struct lz_match *cache_limit;
        struct xpress_mc_pos_data *optimum;
-       unsigned optimum_cur_idx;
-       unsigned optimum_end_idx;
        u8 costs[XPRESS_NUM_SYMBOLS];
 
        /* The selected sequence of matches/literals  */
        struct xpress_item *chosen_items;
 
-       /* Data currently being compressed  */
-       const u8 *cur_window;
-       u32 cur_window_size;
-
        /* Symbol frequency counters  */
        u32 freqs[XPRESS_NUM_SYMBOLS];
 
@@ -104,31 +95,51 @@ struct xpress_compressor {
        u8 lens[XPRESS_NUM_SYMBOLS];
 };
 
-/* Match-chooser position data.
- * See corresponding declaration in lzx-compress.c for more information.  */
+/* Intermediate XPRESS match/literal format  */
+struct xpress_item {
+
+       /* Bits 0  -  8: Symbol
+        * Bits 9  - 24: Length - XPRESS_MIN_MATCH_LEN
+        * Bits 25 - 28: Number of extra offset bits
+        * Bits 29+    : Extra offset bits  */
+
+       u64 data;
+};
+
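As a worked example of this packing (matching the encoding in xpress_declare_match() below): a match of length 5 at offset 100 has adjusted length 2 and bsr32(100) = 6, so the symbol is 256 + ((6 << 4) | 2) = 354 and the extra offset bits are 100 ^ (1 << 6) = 36, giving data = 354 | (2 << 9) | (6 << 25) | ((u64)36 << 29).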
+/*
+ * Match chooser position data:
+ *
+ * An array of these structures is used during the near-optimal match-choosing
+ * algorithm.  They correspond to consecutive positions in the window and are
+ * used to keep track of the cost to reach each position, and the match/literal
+ * choices that need to be chosen to reach that position.
+ */
 struct xpress_mc_pos_data {
 struct xpress_mc_pos_data {
+
+       /* The cost, in bits, of the lowest-cost path that has been found to
+        * reach this position.  This can change as progressively lower cost
+        * paths are found to reach this position.  */
        u32 cost;
        u32 cost;
-#define MC_INFINITE_COST ((u32)~0UL)
-
-       union {
-               struct {
-                       u32 link;
-                       u32 match_offset;
-               } prev;
-               struct {
-                       u32 link;
-                       u32 match_offset;
-               } next;
-       };
-};
+#define MC_INFINITE_COST UINT32_MAX
 
-/* Intermediate XPRESS match/literal representation.  */
-struct xpress_item {
-       u16 adjusted_len;  /* Match length minus XPRESS_MIN_MATCH_LEN */
-       u16 offset;        /* Match offset */
-       /* For literals, offset == 0 and adjusted_len is the literal byte.  */
+       /* The match or literal that was taken to reach this position.  This can
+        * change as progressively lower cost paths are found to reach this
+        * position.
+        *
+        * This variable is divided into two bitfields.
+        *
+        * Literals:
+        *      Low bits are 1, high bits are the literal.
+        *
+        * Matches:
+        *      Low bits are the match length, high bits are the offset.
+        */
+       u32 mc_item_data;
+#define MC_OFFSET_SHIFT 16
+#define MC_LEN_MASK ((1 << MC_OFFSET_SHIFT) - 1)
 };
 
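For example, a literal byte 0x41 is stored as (0x41 << MC_OFFSET_SHIFT) | 1, and a match of length 10 at offset 300 as (300 << MC_OFFSET_SHIFT) | 10. Because the low bits also give the number of window positions the item spans, the item lists below can be walked using the lengths alone.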
+
 /*
  * Structure to keep track of the current state of sending data to the
  * compressed output buffer.
@@ -194,7 +205,7 @@ xpress_init_output(struct xpress_output_bitstream *os, void *buffer, u32 size)
  * If the output buffer space is exhausted, then the bits will be ignored, and
  * xpress_flush_output() will return 0 when it gets called.
  */
-static _always_inline_attribute void
+static inline void
 xpress_write_bits(struct xpress_output_bitstream *os,
                  const u32 bits, const unsigned int num_bits)
 {
@@ -218,7 +229,7 @@ xpress_write_bits(struct xpress_output_bitstream *os,
 /*
  * Interweave a literal byte into the output bitstream.
  */
-static _always_inline_attribute void
+static inline void
 xpress_write_byte(struct xpress_output_bitstream *os, u8 byte)
 {
        if (os->next_byte < os->end)
@@ -241,31 +252,41 @@ xpress_flush_output(struct xpress_output_bitstream *os)
        return os->next_byte - os->start;
 }
 
-/* Output an XPRESS match.  */
-static void
-xpress_write_match(struct xpress_item match, struct xpress_output_bitstream *os,
-                  const u32 codewords[], const u8 lens[])
+/* Output a match or literal.  */
+static inline void
+xpress_write_item(struct xpress_item item, struct xpress_output_bitstream *os,
+                 const u32 codewords[], const u8 lens[])
 {
-       unsigned len_hdr = min(match.adjusted_len, 0xf);
-       unsigned offset_bsr = bsr32(match.offset);
-       unsigned sym = XPRESS_NUM_CHARS + ((offset_bsr << 4) | len_hdr);
+       u64 data = item.data;
+       unsigned symbol;
+       unsigned adjusted_len;
+       unsigned num_extra_bits;
+       unsigned extra_bits;
 
 
-       xpress_write_bits(os, codewords[sym], lens[sym]);
+       symbol = data & 0x1FF;
+
+       xpress_write_bits(os, codewords[symbol], lens[symbol]);
+
+       if (symbol < XPRESS_NUM_CHARS)  /* Literal?  */
+               return;
+
+       adjusted_len = (data >> 9) & 0xFFFF;
 
        /* If length >= 18, one extra length byte.
         * If length >= 273, three (total) extra length bytes.  */
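        /* Examples: length 20 has adjusted_len = 17, so byte1 = 2 and one
         * extra length byte is written; length 300 has adjusted_len = 297
         * (0x129), so byte1 = 0xff followed by 0x29 and 0x01, three extra
         * bytes in total.  */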
-       if (match.adjusted_len >= 0xf) {
-               u8 byte1 = min(match.adjusted_len - 0xf, 0xff);
+       if (adjusted_len >= 0xf) {
+               u8 byte1 = min(adjusted_len - 0xf, 0xff);
                xpress_write_byte(os, byte1);
                if (byte1 == 0xff) {
-                       xpress_write_byte(os, match.adjusted_len & 0xff);
-                       xpress_write_byte(os, match.adjusted_len >> 8);
+                       xpress_write_byte(os, adjusted_len & 0xff);
+                       xpress_write_byte(os, adjusted_len >> 8);
                }
        }
 
-       /* Offset bits  */
-       xpress_write_bits(os, match.offset ^ (1U << offset_bsr), offset_bsr);
+       num_extra_bits = (data >> 25) & 0xF;
+       extra_bits = data >> 29;
+
+       xpress_write_bits(os, extra_bits, num_extra_bits);
 }
 
 /* Output a sequence of XPRESS matches and literals.  */
@@ -274,16 +295,9 @@ xpress_write_items(struct xpress_output_bitstream *os,
                   const struct xpress_item items[], u32 num_items,
                   const u32 codewords[], const u8 lens[])
 {
-       for (u32 i = 0; i < num_items; i++) {
-               if (items[i].offset) {
-                       /* Match  */
-                       xpress_write_match(items[i], os, codewords, lens);
-               } else {
-                       /* Literal  */
-                       unsigned lit = items[i].adjusted_len;
-                       xpress_write_bits(os, codewords[lit], lens[lit]);
-               }
-       }
+       for (u32 i = 0; i < num_items; i++)
+               xpress_write_item(items[i], os, codewords, lens);
+
        /* End-of-data symbol (required for MS compatibility)  */
        xpress_write_bits(os, codewords[XPRESS_END_OF_DATA], lens[XPRESS_END_OF_DATA]);
 }
@@ -298,28 +312,108 @@ xpress_make_huffman_code(struct xpress_compressor *c)
                                    c->freqs, c->lens, c->codewords);
 }
 
-/* Account for the Huffman symbol that would be produced by outputting the
- * specified literal.  Returns the intermediate representation of the literal.
- */
-static inline struct xpress_item
-xpress_tally_literal(u8 lit, u32 freqs[])
+/* Tally, and optionally record, the specified literal byte.  */
+static inline void
+xpress_declare_literal(struct xpress_compressor *c, unsigned literal,
+                      struct xpress_item **next_chosen_item)
 {
-       freqs[lit]++;
-       return (struct xpress_item) { .offset = 0, .adjusted_len = lit };
+       c->freqs[literal]++;
+
+       if (next_chosen_item) {
+               *(*next_chosen_item)++ = (struct xpress_item) {
+                       .data = literal,
+               };
+       }
 }
 
-/* Account for the Huffman symbol that would be produced by outputting the
- * specified match.  Returns the intermediate representation of the match.  */
-static inline struct xpress_item
-xpress_tally_match(u32 len, u32 offset, u32 freqs[])
+/* Tally, and optionally record, the specified match.  */
+static inline void
+xpress_declare_match(struct xpress_compressor *c,
+                    unsigned len, unsigned offset,
+                    struct xpress_item **next_chosen_item)
 {
-       u32 adjusted_len = len - XPRESS_MIN_MATCH_LEN;
+       unsigned adjusted_len = len - XPRESS_MIN_MATCH_LEN;
        unsigned len_hdr = min(adjusted_len, 0xf);
-       unsigned sym = XPRESS_NUM_CHARS + ((bsr32(offset) << 4) | len_hdr);
+       unsigned offset_bsr = bsr32(offset);
+       unsigned sym = XPRESS_NUM_CHARS + ((offset_bsr << 4) | len_hdr);
+
+       c->freqs[sym]++;
+
+       if (next_chosen_item) {
+               *(*next_chosen_item)++ = (struct xpress_item) {
+                       .data = (u64)sym |
+                               ((u64)adjusted_len << 9) |
+                               ((u64)offset_bsr << 25) |
+                               ((u64)(offset ^ (1U << offset_bsr)) << 29),
+               };
+       }
+}
 
-       freqs[sym]++;
-       return (struct xpress_item) { .offset = offset,
-                                     .adjusted_len = adjusted_len };
+/* Tally, and optionally record, the specified match or literal.  */
+static inline void
+xpress_declare_item(struct xpress_compressor *c, u32 mc_item_data,
+                   struct xpress_item **next_chosen_item)
+{
+       unsigned len = mc_item_data & MC_LEN_MASK;
+       unsigned offset_data = mc_item_data >> MC_OFFSET_SHIFT;
+
+       if (len == 1)
+               xpress_declare_literal(c, offset_data, next_chosen_item);
+       else
+               xpress_declare_match(c, len, offset_data, next_chosen_item);
+}
+
+static inline void
+xpress_record_item_list(struct xpress_compressor *c,
+                       struct xpress_mc_pos_data *cur_optimum_ptr,
+                       struct xpress_item **next_chosen_item)
+{
+       struct xpress_mc_pos_data *end_optimum_ptr;
+       u32 saved_item;
+       u32 item;
+
+       /* The list is currently in reverse order (last item to first item).
+        * Reverse it.  */
+       end_optimum_ptr = cur_optimum_ptr;
+       saved_item = cur_optimum_ptr->mc_item_data;
+       do {
+               item = saved_item;
+               cur_optimum_ptr -= item & MC_LEN_MASK;
+               saved_item = cur_optimum_ptr->mc_item_data;
+               cur_optimum_ptr->mc_item_data = item;
+       } while (cur_optimum_ptr != c->optimum);
+
+       /* Walk the list of items from beginning to end, tallying and recording
+        * each item.  */
+       do {
+               xpress_declare_item(c, cur_optimum_ptr->mc_item_data, next_chosen_item);
+               cur_optimum_ptr += (cur_optimum_ptr->mc_item_data) & MC_LEN_MASK;
+       } while (cur_optimum_ptr != end_optimum_ptr);
+}
+
+static inline void
+xpress_tally_item_list(struct xpress_compressor *c,
+                      struct xpress_mc_pos_data *cur_optimum_ptr)
+{
+       /* Since we're just tallying the items, we don't need to reverse the
+        * list.  Processing the items in reverse order is fine.  */
+       do {
+               xpress_declare_item(c, cur_optimum_ptr->mc_item_data, NULL);
+               cur_optimum_ptr -= (cur_optimum_ptr->mc_item_data & MC_LEN_MASK);
+       } while (cur_optimum_ptr != c->optimum);
+}
+
+/* Tally, and optionally (if next_chosen_item != NULL) record, in order, all
+ * items in the current list of items found by the match-chooser.  */
+static void
+xpress_declare_item_list(struct xpress_compressor *c,
+                        struct xpress_mc_pos_data *cur_optimum_ptr,
+                        struct xpress_item **next_chosen_item)
+{
+       if (next_chosen_item)
+               xpress_record_item_list(c, cur_optimum_ptr, next_chosen_item);
+       else
+               xpress_tally_item_list(c, cur_optimum_ptr);
 }
 
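To make the backwards links concrete: if the chosen path is a literal at position 0 followed by a match of length 4, then optimum[1] holds the literal item and optimum[5] holds the match item, each indexed by the position where it ends. xpress_record_item_list() steps back by 4 and then by 1, re-storing each item at the position where it begins, so the subsequent forward walk visits optimum[0] and then optimum[1] in order.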
 static unsigned
@@ -339,7 +433,6 @@ xpress_get_matches_fillcache(struct xpress_compressor *c,
        } else {
                num_matches = 0;
        }
-       c->cur_window_ptr++;
        *matches_ret = matches;
        return num_matches;
 }
@@ -354,13 +447,12 @@ xpress_get_matches_usecache(struct xpress_compressor *c,
 
        cache_ptr = c->cache_ptr;
        matches = cache_ptr + 1;
-       if (likely(cache_ptr <= c->cache_limit)) {
+       if (cache_ptr <= c->cache_limit) {
                num_matches = cache_ptr->len;
                c->cache_ptr = matches + num_matches;
        } else {
                num_matches = 0;
        }
-       c->cur_window_ptr++;
        *matches_ret = matches;
        return num_matches;
 }
@@ -377,7 +469,6 @@ xpress_get_matches_usecache_nocheck(struct xpress_compressor *c,
        matches = cache_ptr + 1;
        num_matches = cache_ptr->len;
        c->cache_ptr = matches + num_matches;
-       c->cur_window_ptr++;
        *matches_ret = matches;
        return num_matches;
 }
@@ -386,7 +477,6 @@ static unsigned
 xpress_get_matches_noncaching(struct xpress_compressor *c,
                              const struct lz_match **matches_ret)
 {
-       c->cur_window_ptr++;
        *matches_ret = c->cached_matches;
        return lz_mf_get_matches(c->mf, c->cached_matches);
 }
@@ -394,6 +484,8 @@ xpress_get_matches_noncaching(struct xpress_compressor *c,
 /*
  * Find matches at the next position in the window.
  *
+ * This uses a wrapper function around the underlying match-finder.
+ *
  * Returns the number of matches found and sets *matches_ret to point to the
  * matches array.  The matches will be sorted by strictly increasing length and
  * offset.
@@ -406,14 +498,13 @@ xpress_get_matches(struct xpress_compressor *c,
 }
 
 static void
-xpress_skip_bytes_fillcache(struct xpress_compressor *c, u32 n)
+xpress_skip_bytes_fillcache(struct xpress_compressor *c, unsigned n)
 {
        struct lz_match *cache_ptr;
 
-       c->cur_window_ptr += n;
        cache_ptr = c->cache_ptr;
        lz_mf_skip_positions(c->mf, n);
-       if (likely(cache_ptr <= c->cache_limit)) {
+       if (cache_ptr <= c->cache_limit) {
                do {
                        cache_ptr->len = 0;
                        cache_ptr += 1;
@@ -423,11 +514,10 @@ xpress_skip_bytes_fillcache(struct xpress_compressor *c, u32 n)
 }
 
 static void
-xpress_skip_bytes_usecache(struct xpress_compressor *c, u32 n)
+xpress_skip_bytes_usecache(struct xpress_compressor *c, unsigned n)
 {
        struct lz_match *cache_ptr;
 
-       c->cur_window_ptr += n;
        cache_ptr = c->cache_ptr;
        if (likely(cache_ptr <= c->cache_limit)) {
                do {
@@ -438,11 +528,10 @@ xpress_skip_bytes_usecache(struct xpress_compressor *c, u32 n)
 }
 
 static void
-xpress_skip_bytes_usecache_nocheck(struct xpress_compressor *c, u32 n)
+xpress_skip_bytes_usecache_nocheck(struct xpress_compressor *c, unsigned n)
 {
        struct lz_match *cache_ptr;
 
-       c->cur_window_ptr += n;
        cache_ptr = c->cache_ptr;
        do {
                cache_ptr += 1 + cache_ptr->len;
@@ -451,310 +540,282 @@ xpress_skip_bytes_usecache_nocheck(struct xpress_compressor *c, u32 n)
 }
 
 static void
-xpress_skip_bytes_noncaching(struct xpress_compressor *c, u32 n)
+xpress_skip_bytes_noncaching(struct xpress_compressor *c, unsigned n)
 {
-       c->cur_window_ptr += n;
        lz_mf_skip_positions(c->mf, n);
 }
 
 /*
  * Skip the specified number of positions in the window (don't search for
  * matches at them).
+ *
+ * This uses a wrapper function around the underlying match-finder.
  */
 static inline void
-xpress_skip_bytes(struct xpress_compressor *c, u32 n)
+xpress_skip_bytes(struct xpress_compressor *c, unsigned n)
 {
        return (*c->skip_bytes_func)(c, n);
 }
 
-/*
- * Returns the cost, in bits, required to output the literal from the previous
- * window position (the position at which matches were last searched).
- */
-static inline u32
-xpress_prev_literal_cost(const struct xpress_compressor *c)
+/* Set default XPRESS Huffman symbol costs to bootstrap the iterative
+ * optimization algorithm.  */
+static void
+xpress_set_default_costs(u8 costs[])
+{
+       unsigned i;
+
+       /* Literal symbols  */
+       for (i = 0; i < XPRESS_NUM_CHARS; i++)
+               costs[i] = 8;
+
+       /* Match symbols  */
+       for (; i < XPRESS_NUM_SYMBOLS; i++)
+               costs[i] = 10;
+}
+
+/* Copy the Huffman codeword lengths array @lens to the Huffman symbol costs
+ * array @costs, but also assign a default cost to each 0-length (unused)
+ * codeword.  */
+static void
+xpress_set_costs(u8 costs[], const u8 lens[])
 {
-       return c->costs[*(c->cur_window_ptr - 1)];
+       for (unsigned i = 0; i < XPRESS_NUM_SYMBOLS; i++)
+               costs[i] = lens[i] ? lens[i] : XPRESS_MAX_CODEWORD_LEN;
 }
 
 /*
- * Reverse the linked list of near-optimal matches so that they can be returned
- * in forwards order.
+ * Consider coding each match in @matches.
  *
- * Returns the first match in the list.
+ * @matches must be sorted by strictly increasing length and strictly
+ * increasing offset.  This is guaranteed by the match-finder.
+ *
+ * We consider each length from the minimum (3) to the longest
+ * (matches[num_matches - 1].len).  For each length, we consider only
+ * the smallest offset for which that length is available.  Although
+ * this is not guaranteed to be optimal due to the possibility of a
+ * larger offset costing less than a smaller offset to code, this is a
+ * very useful heuristic.
  */
-static struct lz_match
-xpress_match_chooser_reverse_list(struct xpress_compressor *c, unsigned cur_pos)
+static inline void
+xpress_consider_matches(struct xpress_compressor *c,
+                       struct xpress_mc_pos_data *cur_optimum_ptr,
+                       const struct lz_match matches[],
+                       unsigned num_matches)
 {
-       unsigned prev_link, saved_prev_link;
-       u32 prev_match_offset, saved_prev_match_offset;
-
-       c->optimum_end_idx = cur_pos;
-
-       saved_prev_link = c->optimum[cur_pos].prev.link;
-       saved_prev_match_offset = c->optimum[cur_pos].prev.match_offset;
-
-       do {
-               prev_link = saved_prev_link;
-               prev_match_offset = saved_prev_match_offset;
-
-               saved_prev_link = c->optimum[prev_link].prev.link;
-               saved_prev_match_offset = c->optimum[prev_link].prev.match_offset;
-
-               c->optimum[prev_link].next.link = cur_pos;
-               c->optimum[prev_link].next.match_offset = prev_match_offset;
-
-               cur_pos = prev_link;
-       } while (cur_pos != 0);
-
-       c->optimum_cur_idx = c->optimum[0].next.link;
+       unsigned i = 0;
+       unsigned len = XPRESS_MIN_MATCH_LEN;
+       u32 cost;
+       u32 position_cost;
+       unsigned offset;
+       unsigned offset_bsr;
+       unsigned adjusted_len;
+       unsigned len_hdr;
+       unsigned sym;
+
+       if (matches[num_matches - 1].len < 0xf + XPRESS_MIN_MATCH_LEN) {
+               /* All lengths are small.  Optimize accordingly.  */
+               do {
+                       offset = matches[i].offset;
+                       offset_bsr = bsr32(offset);
+                       len_hdr = len - XPRESS_MIN_MATCH_LEN;
+                       sym = XPRESS_NUM_CHARS + ((offset_bsr << 4) | len_hdr);
 
-       return (struct lz_match)
-               { .len = c->optimum_cur_idx,
-                 .offset = c->optimum[0].next.match_offset,
-               };
+                       position_cost = cur_optimum_ptr->cost + offset_bsr;
+                       do {
+                               cost = position_cost + c->costs[sym];
+                               if (cost < (cur_optimum_ptr + len)->cost) {
+                                       (cur_optimum_ptr + len)->cost = cost;
+                                       (cur_optimum_ptr + len)->mc_item_data =
+                                               (offset << MC_OFFSET_SHIFT) | len;
+                               }
+                               sym++;
+                       } while (++len <= matches[i].len);
+               } while (++i != num_matches);
+       } else {
+               /* Some lengths are big.  */
+               do {
+                       offset = matches[i].offset;
+                       offset_bsr = bsr32(offset);
+                       position_cost = cur_optimum_ptr->cost + offset_bsr;
+                       do {
+                               adjusted_len = len - XPRESS_MIN_MATCH_LEN;
+                               len_hdr = min(adjusted_len, 0xf);
+                               sym = XPRESS_NUM_CHARS + ((offset_bsr << 4) | len_hdr);
+
+                               cost = position_cost + c->costs[sym];
+                               if (adjusted_len >= 0xf) {
+                                       cost += 8;
+                                       if (adjusted_len - 0xf >= 0xff)
+                                               cost += 16;
+                               }
+
+                               if (cost < (cur_optimum_ptr + len)->cost) {
+                                       (cur_optimum_ptr + len)->cost = cost;
+                                       (cur_optimum_ptr + len)->mc_item_data =
+                                               (offset << MC_OFFSET_SHIFT) | len;
+                               }
+                       } while (++len <= matches[i].len);
+               } while (++i != num_matches);
+       }
 }
 
 /*
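For instance, given the sorted matches {length 4 at offset 16, length 6 at offset 1000}, the loops above cost lengths 3 and 4 against offset 16 (offset_bsr = 4) and lengths 5 and 6 against offset 1000 (offset_bsr = 9), updating (cur_optimum_ptr + len) wherever the new path is cheaper than the best one found so far.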
 }
 
 /*
- * Near-optimal parsing.
+ * The main near-optimal parsing routine.
+ *
+ * Briefly, the algorithm does an approximate minimum-cost path search to find a
+ * "near-optimal" sequence of matches and literals to output, based on the
+ * current cost model.  The algorithm steps forward, position by position (byte
+ * by byte), and updates the minimum cost path to reach each later position that
+ * can be reached using a match or literal from the current position.  This is
+ * essentially Dijkstra's algorithm in disguise: the graph nodes are positions,
+ * the graph edges are possible matches/literals to code, and the cost of each
+ * edge is the estimated number of bits that will be required to output the
+ * corresponding match or literal.  But one difference is that we actually
+ * compute the lowest-cost path in pieces, where each piece is terminated when
+ * there are no choices to be made.
  *
  *
- * This does a forward lowest-cost path search.  The search is terminated when a
- * sufficiently long match is found, when the search reaches a position with no
- * alternatives, or when the temporary 'optimum' array fills up.  After
- * termination of the search, matches/literals will be returned one by one by
- * successive calls to this function.  Once all the matches/literals are used
- * up, the next call to this function will begin a new search.
+ * If next_chosen_item != NULL, then all items chosen will be recorded (saved in
+ * the chosen_items array).  Otherwise, all items chosen will only be tallied
+ * (symbol frequencies tallied in c->freqs).
  */
-static struct lz_match
-xpress_choose_near_optimal_item(struct xpress_compressor *c)
+static void
+xpress_optim_pass(struct xpress_compressor *c,
+                 struct xpress_item **next_chosen_item)
 {
+       const u8 *window_end;
+       const u8 *window_ptr;
+       struct xpress_mc_pos_data *cur_optimum_ptr;
+       struct xpress_mc_pos_data *end_optimum_ptr;
        const struct lz_match *matches;
        unsigned num_matches;
-       struct lz_match match;
-       unsigned cur_pos;
-       unsigned end_pos;
-       struct xpress_mc_pos_data * const optimum = c->optimum;
-
-       if (c->optimum_cur_idx != c->optimum_end_idx) {
-               /* Return previously computed match or literal.  */
-               match.len = optimum[c->optimum_cur_idx].next.link -
-                                   c->optimum_cur_idx;
-               match.offset = optimum[c->optimum_cur_idx].next.match_offset;
-
-               c->optimum_cur_idx = optimum[c->optimum_cur_idx].next.link;
-               return match;
-       }
-
-       c->optimum_cur_idx = 0;
-       c->optimum_end_idx = 0;
-
-       num_matches = xpress_get_matches(c, &matches);
-
-       if (num_matches == 0)
-               return (struct lz_match) {};
-
-       if (matches[num_matches - 1].len >= c->params.nice_match_length) {
-               /* Take the long match immediately.  */
-               xpress_skip_bytes(c, matches[num_matches - 1].len - 1);
-               return matches[num_matches - 1];
-       }
+       unsigned longest_len;
+       unsigned literal;
+       u32 cost;
 
-       /* Consider coding a literal.  */
-       optimum[1].cost = xpress_prev_literal_cost(c);
-       optimum[1].prev.link = 0;
+       window_ptr = c->cur_window;
+       window_end = &c->cur_window[c->cur_window_size];
 
-       optimum[2].cost = MC_INFINITE_COST;
+begin:
+       /* Start building a new list of items, which will correspond to the next
+        * piece of the overall minimum-cost path.  */
 
-       {
-               /* Consider coding a match.  Cost evaluation is hand-inlined so
-                * that we can do some performance hacks.  */
+       if (window_ptr == window_end)
+               return;
 
-               unsigned i = 0;
-               unsigned len = 3;
-               struct xpress_mc_pos_data *optimum_ptr = &optimum[len];
+       cur_optimum_ptr = c->optimum;
+       cur_optimum_ptr->cost = 0;
+       end_optimum_ptr = cur_optimum_ptr;
 
-               if (matches[num_matches - 1].len < 0xf + XPRESS_MIN_MATCH_LEN) {
-                       do {
-                               u32 offset = matches[i].offset;
-                               u32 offset_bsr = bsr32(offset);
-                               unsigned len_hdr = len - XPRESS_MIN_MATCH_LEN;
-                               unsigned sym = XPRESS_NUM_CHARS +
-                                               ((offset_bsr << 4) | len_hdr);
-                               do {
-                                       optimum_ptr->prev.link = 0;
-                                       optimum_ptr->prev.match_offset = offset;
-                                       optimum_ptr->cost = offset_bsr + c->costs[sym];
-                                       sym++;
-                                       optimum_ptr++;
-                               } while (++len <= matches[i].len);
-                       } while (++i != num_matches);
-               } else {
-                       do {
-                               u32 offset = matches[i].offset;
-                               u32 offset_bsr = bsr32(offset);
-                               do {
-                                       u32 adjusted_len = len - XPRESS_MIN_MATCH_LEN;
-                                       unsigned len_hdr = min(adjusted_len, 0xf);
-                                       unsigned sym = XPRESS_NUM_CHARS +
-                                                       ((offset_bsr << 4) | len_hdr);
-                                       u32 cost = offset_bsr + c->costs[sym];
-                                       if (adjusted_len >= 0xf) {
-                                               cost += 8;
-                                               if (adjusted_len - 0xf >= 0xff)
-                                                       cost += 16;
-                                       }
-
-                                       optimum_ptr->prev.link = 0;
-                                       optimum_ptr->prev.match_offset = offset;
-                                       optimum_ptr->cost = cost;
-                                       optimum_ptr++;
-                               } while (++len <= matches[i].len);
-                       } while (++i != num_matches);
-               }
-       }
-
-       end_pos = matches[num_matches - 1].len;
-       cur_pos = 1;
-       do {
-               u32 cost;
-               u32 longest_len;
+       /* The following loop runs once per byte in the window, except in a
+        * couple of shortcut cases.  */
+       for (;;) {
 
+               /* Find matches at the current position.  */
                num_matches = xpress_get_matches(c, &matches);
 
                if (num_matches) {
+
                        longest_len = matches[num_matches - 1].len;
-                       if (longest_len >= c->params.nice_match_length) {
-                               /* Take the long match immediately.  */
-                               match = xpress_match_chooser_reverse_list(c, cur_pos);
 
-                               optimum[cur_pos].next.match_offset =
-                                       matches[num_matches - 1].offset;
-                               optimum[cur_pos].next.link = cur_pos + longest_len;
-                               c->optimum_end_idx = cur_pos + longest_len;
+                       /* If there's a very long match, choose it immediately.
+                        */
+                       if (longest_len >= c->params.nice_match_length) {
 
                                xpress_skip_bytes(c, longest_len - 1);
+                               window_ptr += longest_len;
 
-                               return match;
-                       }
-               } else {
-                       longest_len = 1;
-               }
+                               if (cur_optimum_ptr != c->optimum)
+                                       xpress_declare_item_list(c, cur_optimum_ptr,
+                                                                next_chosen_item);
 
-               while (end_pos < cur_pos + longest_len)
-                       optimum[++end_pos].cost = MC_INFINITE_COST;
+                               xpress_declare_match(c, longest_len,
+                                                    matches[num_matches - 1].offset,
+                                                    next_chosen_item);
+                               goto begin;
+                       }
 
-               /* Consider coding a literal.  */
-               cost = optimum[cur_pos].cost + xpress_prev_literal_cost(c);
-               if (cost < optimum[cur_pos + 1].cost) {
-                       optimum[cur_pos + 1].cost = cost;
-                       optimum[cur_pos + 1].prev.link = cur_pos;
-               }
+                       /* If reaching any positions for the first time,
+                        * initialize their costs to "infinity".  */
+                       while (end_optimum_ptr < cur_optimum_ptr + longest_len)
+                               (++end_optimum_ptr)->cost = MC_INFINITE_COST;
 
-               if (num_matches) {
-                       /* Consider coding a match.  Cost evaluation is
-                        * hand-inlined so that we can do some performance
-                        * hacks.  */
-                       unsigned i = 0;
-                       unsigned len = 3;
-                       struct xpress_mc_pos_data *optimum_ptr = &optimum[cur_pos + 3];
-                       u32 cur_cost = optimum[cur_pos].cost;
-
-                       if (matches[num_matches - 1].len < 0xf + XPRESS_MIN_MATCH_LEN) {
-                               do {
-                                       u32 offset = matches[i].offset;
-                                       u32 offset_bsr = bsr32(offset);
-                                       unsigned len_hdr = len - XPRESS_MIN_MATCH_LEN;
-                                       unsigned sym = XPRESS_NUM_CHARS +
-                                                       ((offset_bsr << 4) | len_hdr);
-
-                                       u32 base_cost = cur_cost + offset_bsr;
-                                       do {
-                                               cost = base_cost + c->costs[sym];
-                                               if (cost < optimum_ptr->cost) {
-                                                       optimum_ptr->prev.link = cur_pos;
-                                                       optimum_ptr->prev.match_offset = offset;
-                                                       optimum_ptr->cost = cost;
-                                               }
-                                               sym++;
-                                               optimum_ptr++;
-                                       } while (++len <= matches[i].len);
-                               } while (++i != num_matches);
-                       } else {
-                               do {
-                                       u32 offset = matches[i].offset;
-                                       u32 offset_bsr = bsr32(offset);
-
-                                       u32 base_cost = cur_cost + offset_bsr;
-                                       do {
-                                               u32 adjusted_len = len - XPRESS_MIN_MATCH_LEN;
-                                               unsigned len_hdr = min(adjusted_len, 0xf);
-                                               unsigned sym = XPRESS_NUM_CHARS +
-                                                               ((offset_bsr << 4) | len_hdr);
-
-                                               cost = base_cost + c->costs[sym];
-                                               if (adjusted_len >= 0xf) {
-                                                       cost += 8;
-                                                       if (adjusted_len - 0xf >= 0xff)
-                                                               cost += 16;
-                                               }
-
-                                               if (cost < optimum_ptr->cost) {
-                                                       optimum_ptr->prev.link = cur_pos;
-                                                       optimum_ptr->prev.match_offset = offset;
-                                                       optimum_ptr->cost = cost;
-                                               }
-                                               optimum_ptr++;
-                                       } while (++len <= matches[i].len);
-                               } while (++i != num_matches);
+                       /* Consider coding a match.  */
+                       xpress_consider_matches(c, cur_optimum_ptr,
+                                               matches, num_matches);
+               } else {
+                       /* No matches found.  The only choice at this position
+                        * is to code a literal.  */
+
+                       if (end_optimum_ptr == cur_optimum_ptr) {
+                       #if 1
+                               /* Optimization for single literals.  */
+                               if (likely(cur_optimum_ptr == c->optimum)) {
+                                       xpress_declare_literal(c, *window_ptr++,
+                                                              next_chosen_item);
+                                       if (window_ptr == window_end)
+                                               return;
+                                       continue;
+                               }
+                       #endif
+                               (++end_optimum_ptr)->cost = MC_INFINITE_COST;
                        }
                }
 
-               cur_pos++;
-
-       } while (cur_pos != end_pos && cur_pos != XPRESS_OPTIM_ARRAY_LENGTH);
-
-       return xpress_match_chooser_reverse_list(c, cur_pos);
-}
-
-/* Set default XPRESS Huffman symbol costs to kick-start the iterative
- * optimization algorithm.  */
-static void
-xpress_set_default_costs(u8 costs[])
-{
-       unsigned i;
+               /* Consider coding a literal.  */
+               literal = *window_ptr++;
+               cost = cur_optimum_ptr->cost + c->costs[literal];
+               if (cost < (cur_optimum_ptr + 1)->cost) {
+                       (cur_optimum_ptr + 1)->cost = cost;
+                       (cur_optimum_ptr + 1)->mc_item_data =
+                               ((u32)literal << MC_OFFSET_SHIFT) | 1;
+               }
 
-       for (i = 0; i < XPRESS_NUM_CHARS; i++)
-               costs[i] = 8;
+               /* Advance to the next position.  */
+               cur_optimum_ptr++;
+
+               /*
+                * This loop will terminate when either of the following
+                * conditions is true:
+                *
+                * (1) cur_optimum_ptr == end_optimum_ptr
+                *
+                *      There are no paths that extend beyond the current
+                *      position.  In this case, any path to a later position
+                *      must pass through the current position, so we can go
+                *      ahead and choose the list of items that led to this
+                *      position.
+                *
+                * (2) cur_optimum_ptr == &c->optimum[XPRESS_OPTIM_ARRAY_LENGTH]
+                *
+                *      This bounds the number of times the algorithm can step
+                *      forward before it is guaranteed to start choosing items.
+                *      This limits the memory usage.  But
+                *      XPRESS_OPTIM_ARRAY_LENGTH is high enough that on most
+                *      inputs this limit is never reached.
+                *
+                * Note: no check for end-of-block is needed because
+                * end-of-block will trigger condition (1).
+                */
+               if (cur_optimum_ptr == end_optimum_ptr ||
+                   cur_optimum_ptr == &c->optimum[XPRESS_OPTIM_ARRAY_LENGTH])
+                       break;
+       }
 
-       for (; i < XPRESS_NUM_SYMBOLS; i++)
-               costs[i] = 10;
-}
-
-/* Copy the Huffman codeword lengths array @lens to the Huffman symbol costs
- * array @costs, but also assign a default cost to each 0-length (unused)
- * codeword.  */
-static void
-xpress_set_costs(u8 costs[], const u8 lens[])
-{
-       for (unsigned i = 0; i < XPRESS_NUM_SYMBOLS; i++)
-               costs[i] = lens[i] ? lens[i] : XPRESS_MAX_CODEWORD_LEN;
+       /* Choose the current list of items that constitute the minimum-cost
+        * path to the current position.  */
+       xpress_declare_item_list(c, cur_optimum_ptr, next_chosen_item);
+       goto begin;
 }
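To make the "declare" step concrete, below is a minimal sketch of how the path built above can be read back, assuming (as the packing above implies) that the low MC_OFFSET_SHIFT bits of mc_item_data hold the item length, with len == 1 denoting a literal.  wimlib's actual xpress_declare_item_list() additionally reverses the items into forward order and records or tallies each one; this helper is hypothetical:

static void
sketch_read_back_path(const struct xpress_mc_pos_data *optimum,
                      const struct xpress_mc_pos_data *path_end)
{
        const struct xpress_mc_pos_data *node = path_end;

        while (node != optimum) {
                u32 len = node->mc_item_data & ((1U << MC_OFFSET_SHIFT) - 1);
                u32 offset = node->mc_item_data >> MC_OFFSET_SHIFT;

                /* (len, offset) is the last item on the minimum-cost path
                 * to 'node'; for a literal, len == 1 and 'offset' is the
                 * byte value.  Stepping back by 'len' visits the chosen
                 * items in reverse order.  */
                (void)offset;
                node -= len;
        }
}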
 
 /* Near-optimal parsing  */
 static u32
-xpress_choose_items_near_optimal(struct xpress_compressor *c)
+xpress_choose_near_optimal_items(struct xpress_compressor *c)
 {
        u32 num_passes_remaining = c->params.num_optim_passes;
-       const u8 *window_ptr;
-       const u8 *window_end;
        struct xpress_item *next_chosen_item;
-       struct lz_match raw_item;
-       struct xpress_item xpress_item;
-
-       xpress_set_default_costs(c->costs);
-       c->optimum_cur_idx = 0;
-       c->optimum_end_idx = 0;
+       struct xpress_item **next_chosen_item_ptr;
 
+       /* Choose appropriate match-finder wrapper functions.  */
        if (c->params.num_optim_passes > 1) {
                c->get_matches_func = xpress_get_matches_fillcache;
                c->skip_bytes_func = xpress_skip_bytes_fillcache;
@@ -763,108 +824,76 @@ xpress_choose_items_near_optimal(struct xpress_compressor *c)
                c->skip_bytes_func = xpress_skip_bytes_noncaching;
        }
 
-       lz_mf_load_window(c->mf, c->cur_window, c->cur_window_size);
+       /* The first optimization pass will use a default cost model.  Each
+        * additional optimization pass will use a cost model computed from the
+        * previous pass.
+        *
+        * To improve performance, we only generate the array containing the
+        * matches and literals in intermediate form on the final pass.  For
+        * earlier passes, tallying symbol frequencies is sufficient.  */
+       xpress_set_default_costs(c->costs);
 
-       while (--num_passes_remaining) {
-               c->cur_window_ptr = c->cur_window;
-               window_ptr = c->cur_window;
-               window_end = window_ptr + c->cur_window_size;
+       next_chosen_item_ptr = NULL;
+       do {
+               /* Reset the match-finder wrapper.  */
                c->cache_ptr = c->cached_matches;
-               memset(c->freqs, 0, sizeof(c->freqs));
-
-               while (window_ptr != window_end) {
-                       raw_item = xpress_choose_near_optimal_item(c);
-                       if (raw_item.len >= XPRESS_MIN_MATCH_LEN) {
-                               xpress_tally_match(raw_item.len,
-                                                  raw_item.offset, c->freqs);
-                               window_ptr += raw_item.len;
-                       } else {
-                               xpress_tally_literal(*window_ptr, c->freqs);
-                               window_ptr += 1;
-                       }
-               }
-               c->freqs[XPRESS_END_OF_DATA]++;
-               xpress_make_huffman_code(c);
-               xpress_set_costs(c->costs, c->lens);
-               if (c->cache_ptr <= c->cache_limit) {
-                       c->get_matches_func = xpress_get_matches_usecache_nocheck;
-                       c->skip_bytes_func = xpress_skip_bytes_usecache_nocheck;
-               } else {
-                       c->get_matches_func = xpress_get_matches_usecache;
-                       c->skip_bytes_func = xpress_skip_bytes_usecache;
-               }
-       }
 
-       c->cur_window_ptr = c->cur_window;
-       window_ptr = c->cur_window;
-       window_end = window_ptr + c->cur_window_size;
-       c->cache_ptr = c->cached_matches;
-       memset(c->freqs, 0, sizeof(c->freqs));
-       next_chosen_item = c->chosen_items;
-
-       u32 unseen_cost = 9;
-       while (window_ptr != window_end) {
-               raw_item = xpress_choose_near_optimal_item(c);
-               if (raw_item.len >= XPRESS_MIN_MATCH_LEN) {
-                       xpress_item = xpress_tally_match(raw_item.len,
-                                                        raw_item.offset,
-                                                        c->freqs);
-                       window_ptr += raw_item.len;
-               } else {
-                       xpress_item = xpress_tally_literal(*window_ptr,
-                                                          c->freqs);
-                       window_ptr += 1;
+               if (num_passes_remaining == 1) {
+                       /* Last pass: actually generate the items.  */
+                       next_chosen_item = c->chosen_items;
+                       next_chosen_item_ptr = &next_chosen_item;
                }
-               *next_chosen_item++ = xpress_item;
 
-               /* When doing one-pass near-optimal parsing, rebuild the Huffman
-                * code occasionally.  */
-               if (unlikely((next_chosen_item - c->chosen_items) % 2048 == 0) &&
-                   c->cur_window_size >= 16384 &&
-                   c->params.num_optim_passes == 1)
-               {
+               /* Choose the items.  */
+               xpress_optim_pass(c, next_chosen_item_ptr);
+
+               if (num_passes_remaining > 1) {
+                       /* This isn't the last pass.  */
+
+                       /* Make the Huffman code from the symbol frequencies.  */
+                       c->freqs[XPRESS_END_OF_DATA]++;
                        xpress_make_huffman_code(c);
-                       for (unsigned i = 0; i < XPRESS_NUM_SYMBOLS; i++)
-                               c->costs[i] = c->lens[i] ? c->lens[i] : unseen_cost;
-                       if (unseen_cost < 15)
-                               unseen_cost++;
+
+                       /* Reset symbol frequencies.  */
+                       memset(c->freqs, 0, sizeof(c->freqs));
+
+                       /* Update symbol costs.  */
+                       xpress_set_costs(c->costs, c->lens);
+
+                       /* Choose appropriate match-finder wrapper functions.  */
+                       if (c->cache_ptr <= c->cache_limit) {
+                               c->get_matches_func = xpress_get_matches_usecache_nocheck;
+                               c->skip_bytes_func = xpress_skip_bytes_usecache_nocheck;
+                       } else {
+                               c->get_matches_func = xpress_get_matches_usecache;
+                               c->skip_bytes_func = xpress_skip_bytes_usecache;
+                       }
                }
-       }
-       c->freqs[XPRESS_END_OF_DATA]++;
-       xpress_make_huffman_code(c);
+       } while (--num_passes_remaining);
+
+       /* Return the number of items chosen.  */
        return next_chosen_item - c->chosen_items;
 }
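In effect, pass 1 parses with the default costs, and each later pass re-parses with costs equal to the codeword lengths of the Huffman code built from the previous pass's symbol frequencies.  The between-pass update done by xpress_set_costs() (shown removed further up; it is evidently still defined earlier in the file, since it is still called above) reduces to:

        for (unsigned i = 0; i < XPRESS_NUM_SYMBOLS; i++)
                c->costs[i] = c->lens[i] ? c->lens[i] : XPRESS_MAX_CODEWORD_LEN;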
 
 /* Lazy parsing  */
 static u32
-xpress_choose_items_lazy(struct xpress_compressor *c)
+xpress_choose_lazy_items(struct xpress_compressor *c)
 {
-       struct lz_mf *mf;
+       const u8 *window_ptr = c->cur_window;
+       const u8 *window_end = &c->cur_window[c->cur_window_size];
+       struct xpress_item *next_chosen_item = c->chosen_items;
        u32 len_3_too_far;
-       const u8 *window_ptr;
-       const u8 *window_end;
-       u32 num_matches;
-       struct lz_match matches[min(c->params.nice_match_length, c->params.max_search_depth)];
-       struct xpress_item *next_chosen_item;
+       struct lz_mf *mf = c->mf;
+       struct lz_match *matches = c->cached_matches;
+       unsigned num_matches;
        struct lz_match prev_match;
 
-       mf = c->mf;
-
-       lz_mf_load_window(mf, c->cur_window, c->cur_window_size);
-
        if (c->cur_window_size <= 8192)
                len_3_too_far = 2048;
        else
                len_3_too_far = 4096;
 
-       memset(c->freqs, 0, sizeof(c->freqs));
-
-       window_ptr = c->cur_window;
-       window_end = c->cur_window + c->cur_window_size;
-       next_chosen_item = c->chosen_items;
-
-       for (;;) {
-
+       do {
                /* No match is carried over from the previous position  */
 
                num_matches = lz_mf_get_matches(mf, matches);
@@ -875,10 +904,8 @@ xpress_choose_items_lazy(struct xpress_compressor *c)
                     matches[num_matches - 1].offset >= len_3_too_far))
                {
                        /* No matches found => output literal  */
-                       *next_chosen_item++ = xpress_tally_literal(*(window_ptr - 1),
-                                                                  c->freqs);
-                       if (window_ptr == window_end)
-                               break;
+                       xpress_declare_literal(c, *(window_ptr - 1),
+                                              &next_chosen_item);
                        continue;
                }
 
@@ -889,13 +916,11 @@ xpress_choose_items_lazy(struct xpress_compressor *c)
 
                if (prev_match.len >= c->params.nice_match_length) {
                        /* Very long match found => output immediately  */
-                       *next_chosen_item++ = xpress_tally_match(prev_match.len,
-                                                                prev_match.offset,
-                                                                c->freqs);
+                       xpress_declare_match(c, prev_match.len,
+                                            prev_match.offset,
+                                            &next_chosen_item);
                        lz_mf_skip_positions(mf, prev_match.len - 1);
                        window_ptr += prev_match.len - 1;
-                       if (window_ptr == window_end)
-                               break;
                        continue;
                }
 
@@ -906,58 +931,44 @@ xpress_choose_items_lazy(struct xpress_compressor *c)
                    (matches[num_matches - 1].len <= prev_match.len))
                {
                        /* Next match is not longer => output previous match  */
-                       *next_chosen_item++ = xpress_tally_match(prev_match.len,
-                                                                prev_match.offset,
-                                                                c->freqs);
+                       xpress_declare_match(c, prev_match.len,
+                                            prev_match.offset,
+                                            &next_chosen_item);
                        lz_mf_skip_positions(mf, prev_match.len - 2);
                        window_ptr += prev_match.len - 2;
-                       if (window_ptr == window_end)
-                               break;
                        continue;
                }
 
                /* Next match is longer => output literal  */
 
-               *next_chosen_item++ = xpress_tally_literal(*(window_ptr - 2),
-                                                          c->freqs);
+               xpress_declare_literal(c, *(window_ptr - 2), &next_chosen_item);
 
                prev_match = matches[num_matches - 1];
 
                goto have_prev_match;
-       }
 
-       c->freqs[XPRESS_END_OF_DATA]++;
-       xpress_make_huffman_code(c);
+       } while (window_ptr != window_end);
+
        return next_chosen_item - c->chosen_items;
 }
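The tie-breaking rule embedded in the loop above can be isolated as a sketch (this helper is hypothetical, not part of wimlib): the match at the current position is deferred only when the match found one position later is strictly longer; on a tie the earlier match wins, since taking it avoids coding an extra literal.

#include <stdbool.h>

static inline bool
lazy_defer_match(unsigned cur_len, unsigned next_len)
{
        return next_len > cur_len;
}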
 
 /* Greedy parsing  */
 static u32
-xpress_choose_items_greedy(struct xpress_compressor *c)
+xpress_choose_greedy_items(struct xpress_compressor *c)
 {
-       struct lz_mf *mf;
+       const u8 *window_ptr = c->cur_window;
+       const u8 *window_end = &c->cur_window[c->cur_window_size];
+       struct xpress_item *next_chosen_item = c->chosen_items;
        u32 len_3_too_far;
-       const u8 *window_ptr;
-       const u8 *window_end;
-       struct lz_match matches[min(c->params.nice_match_length, c->params.max_search_depth)];
-       u32 num_matches;
-       struct xpress_item *next_chosen_item;
-
-       mf = c->mf;
-
-       lz_mf_load_window(mf, c->cur_window, c->cur_window_size);
+       struct lz_mf *mf = c->mf;
+       struct lz_match *matches = c->cached_matches;
+       unsigned num_matches;
 
        if (c->cur_window_size <= 8192)
                len_3_too_far = 2048;
        else
                len_3_too_far = 4096;
 
-       memset(c->freqs, 0, sizeof(c->freqs));
-
-       window_ptr = c->cur_window;
-       window_end = c->cur_window + c->cur_window_size;
-       next_chosen_item = c->chosen_items;
-
        do {
                /* Get longest match at the current position.  */
                num_matches = lz_mf_get_matches(mf, matches);
@@ -966,80 +977,89 @@ xpress_choose_items_greedy(struct xpress_compressor *c)
                    (matches[num_matches - 1].len == 3 &&
                     matches[num_matches - 1].offset >= len_3_too_far))
                {
-                       *next_chosen_item++ = xpress_tally_literal(*window_ptr, c->freqs);
+                       /* No match, or length 3 match with large offset.
+                        * Choose a literal.  */
+                       xpress_declare_literal(c, *window_ptr, &next_chosen_item);
                        window_ptr += 1;
                } else {
-                       u32 len = matches[num_matches - 1].len;
-                       u32 offset = matches[num_matches - 1].offset;
+                       /* Match found.  Choose it.  */
+                       unsigned len = matches[num_matches - 1].len;
+                       unsigned offset = matches[num_matches - 1].offset;
 
-                       *next_chosen_item++ = xpress_tally_match(len, offset, c->freqs);
+                       xpress_declare_match(c, len, offset, &next_chosen_item);
                        lz_mf_skip_positions(mf, len - 1);
                        window_ptr += len;
                }
        } while (window_ptr != window_end);
 
-       c->freqs[XPRESS_END_OF_DATA]++;
-       xpress_make_huffman_code(c);
        return next_chosen_item - c->chosen_items;
 }
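In both the greedy and lazy parsers, len_3_too_far rejects length-3 matches with large offsets, presumably because such a match costs roughly as many bits (a Huffman symbol plus 11 or more explicit offset bits) as the three literals it replaces.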
 
-/* Huffman-only parsing  */
+/* Literals-only parsing  */
 static u32
-xpress_choose_items_huffonly(struct xpress_compressor *c)
+xpress_choose_literals(struct xpress_compressor *c)
 {
-       const u8 *window_ptr;
-       const u8 *window_end;
-       struct xpress_item *next_chosen_item;
-
-       memset(c->freqs, 0, sizeof(c->freqs));
-
-       window_ptr = c->cur_window;
-       window_end = c->cur_window + c->cur_window_size;
-       next_chosen_item = c->chosen_items;
+       const u8 *window_ptr = c->cur_window;
+       const u8 *window_end = &c->cur_window[c->cur_window_size];
+       struct xpress_item *next_chosen_item = c->chosen_items;
 
        do {
-               *next_chosen_item++ = xpress_tally_literal(*window_ptr++, c->freqs);
+               xpress_declare_literal(c, *window_ptr++, &next_chosen_item);
        } while (window_ptr != window_end);
 
-       c->freqs[XPRESS_END_OF_DATA]++;
-       xpress_make_huffman_code(c);
        return next_chosen_item - c->chosen_items;
 }
 
-/* Given the specified compression level and maximum window size, build the
- * parameters to use for XPRESS compression.  */
+/*
+ * 'choose_items_func' is provided a data buffer c->cur_window of length
+ * c->cur_window_size bytes.  This data buffer will have already been loaded
+ * into the match-finder c->mf.  'choose_items_func' must choose the
+ * match/literal sequence to output to represent this data buffer.  The
+ * intermediate representation of this match/literal sequence must be recorded
+ * in c->chosen_items, and the Huffman symbols used must be tallied in c->freqs.
+ * The return value must be the number of items written to c->chosen_items.
+ */
+static u32
+xpress_choose_items(struct xpress_compressor *c)
+{
+       return (*c->params.choose_items_func)(c);
+}
+
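(xpress_choose_literals above is the simplest conforming implementation of this contract: it records one literal item per byte and tallies only literal frequencies.)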
+/* Set internal compression parameters for the specified compression level and
+ * maximum window size.  */
 static void
 xpress_build_params(unsigned int compression_level, u32 max_window_size,
                    struct xpress_compressor_params *xpress_params)
 {
        memset(xpress_params, 0, sizeof(*xpress_params));
+       xpress_params->num_optim_passes = 1;
 
        if (compression_level == 1) {
 
-               /* Huffman only (no Lempel-Ziv matches)  */
+               /* Literals-only parsing  */
+               xpress_params->choose_items_func = xpress_choose_literals;
                xpress_params->mf_algo = LZ_MF_NULL;
-               xpress_params->choose_items_func = xpress_choose_items_huffonly;
 
        } else if (compression_level < 30) {
 
                /* Greedy parsing  */
+               xpress_params->choose_items_func = xpress_choose_greedy_items;
                xpress_params->mf_algo = LZ_MF_HASH_CHAINS;
-               xpress_params->choose_items_func = xpress_choose_items_greedy;
                xpress_params->nice_match_length = compression_level;
                xpress_params->max_search_depth = compression_level / 2;
 
        } else if (compression_level < 60) {
 
                /* Lazy parsing  */
+               xpress_params->choose_items_func = xpress_choose_lazy_items;
                xpress_params->mf_algo = LZ_MF_HASH_CHAINS;
-               xpress_params->choose_items_func = xpress_choose_items_lazy;
                xpress_params->nice_match_length = compression_level;
                xpress_params->max_search_depth = compression_level / 2;
 
        } else {
 
                /* Near-optimal parsing  */
-               xpress_params->choose_items_func = xpress_choose_items_near_optimal;
+               xpress_params->choose_items_func = xpress_choose_near_optimal_items;
                if (max_window_size >= 16384)
                        xpress_params->mf_algo = LZ_MF_BINARY_TREES;
                else
@@ -1052,8 +1072,8 @@ xpress_build_params(unsigned int compression_level, u32 max_window_size,
        }
 }
 
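As a usage sketch: the public API selects among these parsers purely through the compression level.  Assuming the wimlib_create_compressor() signature documented in wimlib.h for this release (an assumption; it is not shown in this diff), a level in [30, 60) requests the XPRESS lazy parser:

#include <wimlib.h>

static int
make_xpress_lazy_compressor(struct wimlib_compressor **compressor_ret)
{
        /* Level 50 falls in the [30, 60) range mapped to lazy parsing
         * above; 32768 is the usual WIM chunk size for XPRESS.  */
        return wimlib_create_compressor(WIMLIB_COMPRESSION_TYPE_XPRESS,
                                        32768, 50, compressor_ret);
}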
-/* Given the specified XPRESS parameters and maximum window size, build the
- * parameters to use for match-finding.  */
+/* Given the internal compression parameters and maximum window size, build the
+ * Lempel-Ziv match-finder parameters.  */
 static void
 xpress_build_mf_params(const struct xpress_compressor_params *xpress_params,
                       u32 max_window_size, struct lz_mf_params *mf_params)
@@ -1084,20 +1104,25 @@ xpress_get_needed_memory(size_t max_window_size, unsigned int compression_level)
 
        size += sizeof(struct xpress_compressor);
 
+       /* mf */
        size += lz_mf_get_needed_memory(params.mf_algo, max_window_size);
 
-       if (params.choose_items_func == xpress_choose_items_near_optimal) {
+       /* optimum */
+       if (params.choose_items_func == xpress_choose_near_optimal_items) {
                size += (XPRESS_OPTIM_ARRAY_LENGTH + params.nice_match_length) *
-                                     sizeof(struct xpress_mc_pos_data);
-               if (params.num_optim_passes > 1) {
-                       size_t cache_len = max(max_window_size * XPRESS_CACHE_PER_POS,
-                                              params.max_search_depth + 1);
-                       size += cache_len * sizeof(struct lz_match);
-               } else {
-                       size += params.max_search_depth * sizeof(struct lz_match);
-               }
+                       sizeof(struct xpress_mc_pos_data);
+       }
+
+       /* cached_matches */
+       if (params.num_optim_passes > 1) {
+               size_t cache_len = max(max_window_size * XPRESS_CACHE_PER_POS,
+                                      params.max_search_depth + 1);
+               size += cache_len * sizeof(struct lz_match);
+       } else {
+               size += params.max_search_depth * sizeof(struct lz_match);
        }
 
+       /* chosen_items */
        size += max_window_size * sizeof(struct xpress_item);
 
        return size;
@@ -1127,26 +1152,27 @@ xpress_create_compressor(size_t max_window_size, unsigned int compression_level,
        if (!c->mf)
                goto oom;
 
-       if (params.choose_items_func == xpress_choose_items_near_optimal) {
+       if (params.choose_items_func == xpress_choose_near_optimal_items) {
                c->optimum = MALLOC((XPRESS_OPTIM_ARRAY_LENGTH +
                                     params.nice_match_length) *
                                      sizeof(struct xpress_mc_pos_data));
                if (!c->optimum)
                        goto oom;
-               if (params.num_optim_passes > 1) {
-                       size_t cache_len = max(max_window_size * XPRESS_CACHE_PER_POS,
-                                              params.max_search_depth + 1);
-                       c->cached_matches = MALLOC(cache_len * sizeof(struct lz_match));
-                       if (!c->cached_matches)
-                               goto oom;
-                       c->cache_limit = c->cached_matches + cache_len -
-                                          (params.max_search_depth + 1);
-               } else {
-                       c->cached_matches = MALLOC(params.max_search_depth *
-                                                  sizeof(struct lz_match));
-                       if (!c->cached_matches)
-                               goto oom;
-               }
+       }
+
+       if (params.num_optim_passes > 1) {
+               size_t cache_len = max(max_window_size * XPRESS_CACHE_PER_POS,
+                                      params.max_search_depth + 1);
+               c->cached_matches = MALLOC(cache_len * sizeof(struct lz_match));
+               if (!c->cached_matches)
+                       goto oom;
+               c->cache_limit = c->cached_matches + cache_len -
+                                  (params.max_search_depth + 1);
+       } else {
+               c->cached_matches = MALLOC(params.max_search_depth *
+                                          sizeof(struct lz_match));
+               if (!c->cached_matches)
+                       goto oom;
        }
 
        c->chosen_items = MALLOC(max_window_size * sizeof(struct xpress_item));
@@ -1177,10 +1203,14 @@ xpress_compress(const void *uncompressed_data, size_t uncompressed_size,
        if (compressed_size_avail < XPRESS_NUM_SYMBOLS / 2 + 50)
                return 0;
 
-       /* Determine match/literal sequence to divide the data into.  */
+       /* Determine match/literal sequence.  */
        c->cur_window = uncompressed_data;
        c->cur_window_size = uncompressed_size;
-       num_chosen_items = (*c->params.choose_items_func)(c);
+       lz_mf_load_window(c->mf, c->cur_window, c->cur_window_size);
+       memset(c->freqs, 0, sizeof(c->freqs));
+       num_chosen_items = xpress_choose_items(c);
+       c->freqs[XPRESS_END_OF_DATA]++;
+       xpress_make_huffman_code(c);
 
        /* Output the Huffman code as a series of 512 4-bit lengths.  */
        cptr = compressed_data;