diff --git a/src/lzx_compress.c b/src/lzx_compress.c
index f0c3c5268c82c5e26347b9c42e5c008e811886c8..02498bf5f6b7b6cc51b6563a7b1fdf2892ffcaa0 100644
--- a/src/lzx_compress.c
+++ b/src/lzx_compress.c
@@ -5,7 +5,7 @@
  */
 
 /*
- * Copyright (C) 2012-2016 Eric Biggers
+ * Copyright (C) 2012-2017 Eric Biggers
  *
  * This file is free software; you can redistribute it and/or modify it under
  * the terms of the GNU Lesser General Public License as published by the Free
@@ -18,7 +18,7 @@
  * details.
  *
  * You should have received a copy of the GNU Lesser General Public License
- * along with this file; if not, see http://www.gnu.org/licenses/.
+ * along with this file; if not, see https://www.gnu.org/licenses/.
  */
 
 
 #include "wimlib/compress_common.h"
 #include "wimlib/compressor_ops.h"
 #include "wimlib/error.h"
-#include "wimlib/lz_extend.h"
 #include "wimlib/lzx_common.h"
 #include "wimlib/unaligned.h"
 #include "wimlib/util.h"
@@ -263,22 +262,32 @@ struct lzx_block_split_stats {
  */
 struct lzx_sequence {
 
-       /* The number of literals in the run.  This may be 0.  The literals are
-        * not stored explicitly in this structure; instead, they are read
-        * directly from the uncompressed data.  */
-       u16 litrunlen;
-
-       /* If the next field doesn't indicate end-of-block, then this is the
-        * match length minus LZX_MIN_MATCH_LEN.  */
-       u16 adjusted_length;
-
-       /* If bit 31 is clear, then this field contains the match header in bits
-        * 0-8, and either the match offset plus LZX_OFFSET_ADJUSTMENT or a
-        * recent offset code in bits 9-30.  Otherwise (if bit 31 is set), this
-        * sequence's literal run was the last literal run in the block, so
-        * there is no match that follows it.  */
-       u32 adjusted_offset_and_match_hdr;
-};
+       /*
+        * Bits 9..31: the number of literals in this run.  This may be 0 and
+        * can be at most about SOFT_MAX_BLOCK_SIZE.  The literals are not
+        * stored explicitly in this structure; instead, they are read directly
+        * from the uncompressed data.
+        *
+        * Bits 0..8: the length of the match which follows the literals, or 0
+        * if this literal run was the last in the block, so there is no match
+        * which follows it.  This can be at most LZX_MAX_MATCH_LEN.
+        */
+       u32 litrunlen_and_matchlen;
+#define SEQ_MATCHLEN_BITS      9
+#define SEQ_MATCHLEN_MASK      (((u32)1 << SEQ_MATCHLEN_BITS) - 1)
+
+       /*
+        * If 'matchlen' doesn't indicate end-of-block, then this contains:
+        *
+        * Bits 10..31: either the offset plus LZX_OFFSET_ADJUSTMENT or a recent
+        * offset code, depending on the offset slot encoded in the main symbol.
+        *
+        * Bits 0..9: the main symbol.
+        */
+       u32 adjusted_offset_and_mainsym;
+#define SEQ_MAINSYM_BITS       10
+#define SEQ_MAINSYM_MASK       (((u32)1 << SEQ_MAINSYM_BITS) - 1)
+} __attribute__((aligned(8)));
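+
+/*
+ * For illustration only: a sketch of how the packed fields of struct
+ * lzx_sequence are accessed.  These helper names are hypothetical; the code
+ * below open-codes the same shifts and masks instead of calling helpers.
+ */
+static forceinline u32
+lzx_seq_litrunlen(const struct lzx_sequence *seq)
+{
+       return seq->litrunlen_and_matchlen >> SEQ_MATCHLEN_BITS;
+}
+
+static forceinline unsigned
+lzx_seq_matchlen(const struct lzx_sequence *seq)
+{
+       return seq->litrunlen_and_matchlen & SEQ_MATCHLEN_MASK;
+}
+
+static forceinline unsigned
+lzx_seq_mainsym(const struct lzx_sequence *seq)
+{
+       return seq->adjusted_offset_and_mainsym & SEQ_MAINSYM_MASK;
+}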
 
 /*
  * This structure represents a byte position in the input buffer and a node in
@@ -318,13 +327,13 @@ struct lzx_optimum_node {
         * behavior applies --- see the code.
         */
        u32 item;
-#define OPTIMUM_OFFSET_SHIFT 9
-#define OPTIMUM_LEN_MASK ((1 << OPTIMUM_OFFSET_SHIFT) - 1)
+#define OPTIMUM_OFFSET_SHIFT   SEQ_MATCHLEN_BITS
+#define OPTIMUM_LEN_MASK       SEQ_MATCHLEN_MASK
 #if CONSIDER_GAP_MATCHES
 #  define OPTIMUM_GAP_MATCH 0x80000000
 #endif
 
-} _aligned_attribute(8);
+} __attribute__((aligned(8)));
 
 /* The cost model for near-optimal parsing */
 struct lzx_costs {
@@ -488,7 +497,7 @@ struct lzx_compressor {
  * This requires that the limit be no more than the length of offset_slot_tab_1
  * (currently 32768).
  */
-static inline bool
+static forceinline bool
 lzx_is_16_bit(size_t max_bufsize)
 {
        STATIC_ASSERT(ARRAY_LEN(((struct lzx_compressor *)0)->offset_slot_tab_1) == 32768);
@@ -498,15 +507,43 @@ lzx_is_16_bit(size_t max_bufsize)
 /*
  * Return the offset slot for the specified adjusted match offset.
  */
-static inline unsigned
+static forceinline unsigned
 lzx_get_offset_slot(struct lzx_compressor *c, u32 adjusted_offset,
                    bool is_16_bit)
 {
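+       /* A constant adjusted offset below LZX_NUM_RECENT_OFFSETS is a recent
+        * offset code, and its offset slot equals the adjusted offset itself,
+        * so it can be resolved at compile time without the table lookup. */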
+       if (__builtin_constant_p(adjusted_offset) &&
+           adjusted_offset < LZX_NUM_RECENT_OFFSETS)
+               return adjusted_offset;
        if (is_16_bit || adjusted_offset < ARRAY_LEN(c->offset_slot_tab_1))
                return c->offset_slot_tab_1[adjusted_offset];
        return c->offset_slot_tab_2[adjusted_offset >> 14];
 }
 
+/*
+ * For a match that has the specified length and adjusted offset, tally its main
+ * symbol, and if needed its length symbol; then return its main symbol.
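+ *
+ * (The returned main symbol works out to LZX_NUM_CHARS + offset_slot *
+ * LZX_NUM_LEN_HEADERS + length header, where the length header is
+ * 'length - LZX_MIN_MATCH_LEN', or LZX_NUM_PRIMARY_LENS when a separate
+ * length symbol is needed.)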
+ */
+static forceinline unsigned
+lzx_tally_main_and_lensyms(struct lzx_compressor *c, unsigned length,
+                          u32 adjusted_offset, bool is_16_bit)
+{
+       unsigned mainsym;
+
+       if (length >= LZX_MIN_SECONDARY_LEN) {
+               /* Length symbol needed */
+               c->freqs.len[length - LZX_MIN_SECONDARY_LEN]++;
+               mainsym = LZX_NUM_CHARS + LZX_NUM_PRIMARY_LENS;
+       } else {
+               /* No length symbol needed */
+               mainsym = LZX_NUM_CHARS + length - LZX_MIN_MATCH_LEN;
+       }
+
+       mainsym += LZX_NUM_LEN_HEADERS *
+                  lzx_get_offset_slot(c, adjusted_offset, is_16_bit);
+       c->freqs.main[mainsym]++;
+       return mainsym;
+}
+
 /*
  * The following macros call either the 16-bit or the 32-bit version of a
  * matchfinder function based on the value of 'is_16_bit', which will be known
@@ -574,7 +611,7 @@ lzx_init_output(struct lzx_output_bitstream *os, void *buffer, size_t size)
  * Add some bits to the bitbuffer variable of the output bitstream.  The caller
  * must make sure there is enough room.
  */
-static inline void
+static forceinline void
 lzx_add_bits(struct lzx_output_bitstream *os, u32 bits, unsigned num_bits)
 {
        os->bitbuf = (os->bitbuf << num_bits) | bits;
@@ -586,7 +623,7 @@ lzx_add_bits(struct lzx_output_bitstream *os, u32 bits, unsigned num_bits)
  * specifies the maximum number of bits that may have been added since the last
  * flush.
  */
-static inline void
+static forceinline void
 lzx_flush_bits(struct lzx_output_bitstream *os, unsigned max_num_bits)
 {
        /* Masking the number of bits to shift is only needed to avoid undefined
@@ -609,7 +646,7 @@ lzx_flush_bits(struct lzx_output_bitstream *os, unsigned max_num_bits)
 }
 
 /* Add at most 16 bits to the bitbuffer and flush it.  */
-static inline void
+static forceinline void
 lzx_write_bits(struct lzx_output_bitstream *os, u32 bits, unsigned num_bits)
 {
        lzx_add_bits(os, bits, num_bits);
@@ -889,16 +926,22 @@ lzx_write_sequences(struct lzx_output_bitstream *os, int block_type,
                    const struct lzx_codes *codes)
 {
        const struct lzx_sequence *seq = sequences;
-       u32 ones_if_aligned = 0 - (block_type == LZX_BLOCKTYPE_ALIGNED);
+       unsigned min_aligned_offset_slot;
+
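+       /* Only an aligned offset block entropy-codes the low
+        * LZX_NUM_ALIGNED_OFFSET_BITS bits of large offsets with the aligned
+        * offset code, so for other block types set the threshold past every
+        * offset slot. */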
+       if (block_type == LZX_BLOCKTYPE_ALIGNED)
+               min_aligned_offset_slot = LZX_MIN_ALIGNED_OFFSET_SLOT;
+       else
+               min_aligned_offset_slot = LZX_MAX_OFFSET_SLOTS;
 
        for (;;) {
                /* Output the next sequence.  */
 
-               unsigned litrunlen = seq->litrunlen;
-               unsigned match_hdr;
-               unsigned main_symbol;
-               unsigned adjusted_length;
+               u32 litrunlen = seq->litrunlen_and_matchlen >> SEQ_MATCHLEN_BITS;
+               unsigned matchlen = seq->litrunlen_and_matchlen & SEQ_MATCHLEN_MASK;
+               STATIC_ASSERT((u32)~SEQ_MATCHLEN_MASK >> SEQ_MATCHLEN_BITS >=
+                             SOFT_MAX_BLOCK_SIZE);
                u32 adjusted_offset;
+               unsigned main_symbol;
                unsigned offset_slot;
                unsigned num_extra_bits;
                u32 extra_bits;
@@ -953,25 +996,26 @@ lzx_write_sequences(struct lzx_output_bitstream *os, int block_type,
                }
 
                /* Was this the last literal run?  */
-               if (seq->adjusted_offset_and_match_hdr & 0x80000000)
+               if (matchlen == 0)
                        return;
 
                /* Nope; output the match.  */
 
-               match_hdr = seq->adjusted_offset_and_match_hdr & 0x1FF;
-               main_symbol = LZX_NUM_CHARS + match_hdr;
-               adjusted_length = seq->adjusted_length;
-
-               block_data += adjusted_length + LZX_MIN_MATCH_LEN;
+               block_data += matchlen;
 
-               offset_slot = match_hdr / LZX_NUM_LEN_HEADERS;
-               adjusted_offset = seq->adjusted_offset_and_match_hdr >> 9;
+               adjusted_offset = seq->adjusted_offset_and_mainsym >> SEQ_MAINSYM_BITS;
+               main_symbol = seq->adjusted_offset_and_mainsym & SEQ_MAINSYM_MASK;
 
+               offset_slot = (main_symbol - LZX_NUM_CHARS) / LZX_NUM_LEN_HEADERS;
                num_extra_bits = lzx_extra_offset_bits[offset_slot];
-               extra_bits = adjusted_offset - lzx_offset_slot_base[offset_slot];
+               extra_bits = adjusted_offset - (lzx_offset_slot_base[offset_slot] +
+                                               LZX_OFFSET_ADJUSTMENT);
 
-       #define MAX_MATCH_BITS  (MAIN_CODEWORD_LIMIT + LENGTH_CODEWORD_LIMIT + \
-                                14 + ALIGNED_CODEWORD_LIMIT)
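+       /* An upper bound on the number of bits buffered for one match before
+        * flushing: main symbol, length symbol, non-aligned extra offset bits,
+        * and aligned offset symbol. */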
+       #define MAX_MATCH_BITS (MAIN_CODEWORD_LIMIT +           \
+                               LENGTH_CODEWORD_LIMIT +         \
+                               LZX_MAX_NUM_EXTRA_BITS -        \
+                               LZX_NUM_ALIGNED_OFFSET_BITS +   \
+                               ALIGNED_CODEWORD_LIMIT)
 
                /* Verify optimization is enabled on 64-bit  */
                STATIC_ASSERT(WORDBITS < 64 || CAN_BUFFER(MAX_MATCH_BITS));
@@ -985,11 +1029,11 @@ lzx_write_sequences(struct lzx_output_bitstream *os, int block_type,
 
                /* If needed, output the length symbol for the match.  */
 
-               if (adjusted_length >= LZX_NUM_PRIMARY_LENS) {
-                       lzx_add_bits(os, codes->codewords.len[adjusted_length -
-                                                             LZX_NUM_PRIMARY_LENS],
-                                    codes->lens.len[adjusted_length -
-                                                    LZX_NUM_PRIMARY_LENS]);
+               if (matchlen >= LZX_MIN_SECONDARY_LEN) {
+                       lzx_add_bits(os, codes->codewords.len[matchlen -
+                                                             LZX_MIN_SECONDARY_LEN],
+                                    codes->lens.len[matchlen -
+                                                    LZX_MIN_SECONDARY_LEN]);
                        if (!CAN_BUFFER(MAX_MATCH_BITS))
                                lzx_flush_bits(os, LENGTH_CODEWORD_LIMIT);
                }
@@ -1000,12 +1044,13 @@ lzx_write_sequences(struct lzx_output_bitstream *os, int block_type,
                 * there are at least 3 extra offset bits required.  All other
                 * extra offset bits are output verbatim.  */
 
-               if ((adjusted_offset & ones_if_aligned) >= 16) {
+               if (offset_slot >= min_aligned_offset_slot) {
 
                        lzx_add_bits(os, extra_bits >> LZX_NUM_ALIGNED_OFFSET_BITS,
                                     num_extra_bits - LZX_NUM_ALIGNED_OFFSET_BITS);
                        if (!CAN_BUFFER(MAX_MATCH_BITS))
-                               lzx_flush_bits(os, 14);
+                               lzx_flush_bits(os, LZX_MAX_NUM_EXTRA_BITS -
+                                                  LZX_NUM_ALIGNED_OFFSET_BITS);
 
                        lzx_add_bits(os, codes->codewords.aligned[adjusted_offset &
                                                                  LZX_ALIGNED_OFFSET_BITMASK],
@@ -1014,11 +1059,11 @@ lzx_write_sequences(struct lzx_output_bitstream *os, int block_type,
                        if (!CAN_BUFFER(MAX_MATCH_BITS))
                                lzx_flush_bits(os, ALIGNED_CODEWORD_LIMIT);
                } else {
-                       STATIC_ASSERT(CAN_BUFFER(17));
+                       STATIC_ASSERT(CAN_BUFFER(LZX_MAX_NUM_EXTRA_BITS));
 
                        lzx_add_bits(os, extra_bits, num_extra_bits);
                        if (!CAN_BUFFER(MAX_MATCH_BITS))
-                               lzx_flush_bits(os, 17);
+                               lzx_flush_bits(os, LZX_MAX_NUM_EXTRA_BITS);
                }
 
                if (CAN_BUFFER(MAX_MATCH_BITS))
@@ -1181,7 +1226,7 @@ lzx_flush_block(struct lzx_compressor *c, struct lzx_output_bitstream *os,
  * but rather we combine many symbols into a single "observation type".  For
  * literals we only look at the high bits and low bits, and for matches we only
  * look at whether the match is long or not.  The assumption is that for typical
- * "real" data, places that are good block boundaries will tend to be noticable
+ * "real" data, places that are good block boundaries will tend to be noticeable
  * based only on changes in these aggregate frequencies, without looking for
  * subtle differences in individual symbols.  For example, a change from ASCII
  * bytes to non-ASCII bytes, or from few matches (generally less compressible)
@@ -1208,7 +1253,7 @@ lzx_init_block_split_stats(struct lzx_block_split_stats *stats)
 
 /* Literal observation.  Heuristic: use the top 2 bits and the low bit of the
  * literal, for 8 possible literal observation types.  */
-static inline void
+static forceinline void
 lzx_observe_literal(struct lzx_block_split_stats *stats, u8 lit)
 {
        stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++;
@@ -1217,7 +1262,7 @@ lzx_observe_literal(struct lzx_block_split_stats *stats, u8 lit)
 
 /* Match observation.  Heuristic: use one observation type for "short match" and
  * one observation type for "long match".  */
-static inline void
+static forceinline void
 lzx_observe_match(struct lzx_block_split_stats *stats, unsigned length)
 {
        stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + (length >= 5)]++;
@@ -1270,7 +1315,7 @@ lzx_should_end_block(struct lzx_block_split_stats *stats)
  */
 struct lzx_lru_queue {
        u64 R;
-} _aligned_attribute(8);
+} __attribute__((aligned(8)));
 
 #define LZX_QUEUE_OFFSET_SHIFT 21
 #define LZX_QUEUE_OFFSET_MASK  (((u64)1 << LZX_QUEUE_OFFSET_SHIFT) - 1)
@@ -1288,26 +1333,26 @@ struct lzx_lru_queue {
        ((u64)1 << LZX_QUEUE_R1_SHIFT) |        \
        ((u64)1 << LZX_QUEUE_R2_SHIFT) }
 
-static inline u64
+static forceinline u64
 lzx_lru_queue_R0(struct lzx_lru_queue queue)
 {
        return (queue.R >> LZX_QUEUE_R0_SHIFT) & LZX_QUEUE_OFFSET_MASK;
 }
 
-static inline u64
+static forceinline u64
 lzx_lru_queue_R1(struct lzx_lru_queue queue)
 {
        return (queue.R >> LZX_QUEUE_R1_SHIFT) & LZX_QUEUE_OFFSET_MASK;
 }
 
-static inline u64
+static forceinline u64
 lzx_lru_queue_R2(struct lzx_lru_queue queue)
 {
        return (queue.R >> LZX_QUEUE_R2_SHIFT) & LZX_QUEUE_OFFSET_MASK;
 }
 
 /* Push a match offset onto the front (most recently used) end of the queue.  */
-static inline struct lzx_lru_queue
+static forceinline struct lzx_lru_queue
 lzx_lru_queue_push(struct lzx_lru_queue queue, u32 offset)
 {
        return (struct lzx_lru_queue) {
@@ -1316,7 +1361,7 @@ lzx_lru_queue_push(struct lzx_lru_queue queue, u32 offset)
 }
 
 /* Swap a match offset to the front of the queue.  */
-static inline struct lzx_lru_queue
+static forceinline struct lzx_lru_queue
 lzx_lru_queue_swap(struct lzx_lru_queue queue, unsigned idx)
 {
        unsigned shift = idx * 21;
@@ -1330,31 +1375,31 @@ lzx_lru_queue_swap(struct lzx_lru_queue queue, unsigned idx)
        };
 }
 
-static inline u32
+static forceinline u32
 lzx_walk_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit,
                   bool record)
 {
+       struct lzx_sequence *seq =
+               &c->chosen_sequences[ARRAY_LEN(c->chosen_sequences) - 1];
        u32 node_idx = block_size;
-       u32 seq_idx = ARRAY_LEN(c->chosen_sequences) - 1;
-       u32 lit_start_node;
+       u32 litrun_end; /* if record=true: end of the current literal run */
 
        if (record) {
-               /* Special value to mark last sequence  */
-               c->chosen_sequences[seq_idx].adjusted_offset_and_match_hdr = 0x80000000;
-               lit_start_node = node_idx;
+               /* The last sequence has matchlen 0 */
+               seq->litrunlen_and_matchlen = 0;
+               litrun_end = node_idx;
        }
 
        for (;;) {
                u32 item;
-               u32 len;
+               unsigned matchlen;
                u32 adjusted_offset;
-               unsigned v;
-               unsigned offset_slot;
+               unsigned mainsym;
 
                /* Tally literals until either a match or the beginning of the
                 * block is reached.  Note: the item in the node at the
-                * beginning of the block has all bits set, causing this loop to
-                * end when it is reached. */
+                * beginning of the block (c->optimum_nodes[0]) has all bits
+                * set, causing this loop to end when it is reached. */
                for (;;) {
                        item = c->optimum_nodes[node_idx].item;
                        if (item & OPTIMUM_LEN_MASK)
@@ -1365,31 +1410,22 @@ lzx_walk_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit,
 
        #if CONSIDER_GAP_MATCHES
                if (item & OPTIMUM_GAP_MATCH) {
-
                        if (node_idx == 0)
                                break;
-
-                       /* Record the literal run length for the next sequence
-                        * (the "previous sequence" when walking backwards). */
-                       len = item & OPTIMUM_LEN_MASK;
+                       /* Tally/record the rep0 match after the gap. */
+                       matchlen = item & OPTIMUM_LEN_MASK;
+                       mainsym = lzx_tally_main_and_lensyms(c, matchlen, 0,
+                                                            is_16_bit);
                        if (record) {
-                               c->chosen_sequences[seq_idx--].litrunlen =
-                                               lit_start_node - node_idx;
-                               lit_start_node = node_idx - len;
+                               seq->litrunlen_and_matchlen |=
+                                       (litrun_end - node_idx) <<
+                                        SEQ_MATCHLEN_BITS;
+                               seq--;
+                               seq->litrunlen_and_matchlen = matchlen;
+                               seq->adjusted_offset_and_mainsym = mainsym;
+                               litrun_end = node_idx - matchlen;
                        }
 
-                       /* Tally the rep0 match after the gap. */
-                       v = len - LZX_MIN_MATCH_LEN;
-                       if (record)
-                               c->chosen_sequences[seq_idx].adjusted_length = v;
-                       if (v >= LZX_NUM_PRIMARY_LENS) {
-                               c->freqs.len[v - LZX_NUM_PRIMARY_LENS]++;
-                               v = LZX_NUM_PRIMARY_LENS;
-                       }
-                       c->freqs.main[LZX_NUM_CHARS + v]++;
-                       if (record)
-                               c->chosen_sequences[seq_idx].adjusted_offset_and_match_hdr = v;
-
                        /* Tally the literal in the gap. */
                        c->freqs.main[(u8)(item >> OPTIMUM_OFFSET_SHIFT)]++;
 
@@ -1397,62 +1433,43 @@ lzx_walk_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit,
                         * (It was temporarily saved in the 'cost' field of the
                         * previous node, which was free to reuse.) */
                        item = c->optimum_nodes[--node_idx].cost;
-                       node_idx -= len;
+                       node_idx -= matchlen;
                }
        #else /* CONSIDER_GAP_MATCHES */
                if (node_idx == 0)
                        break;
        #endif /* !CONSIDER_GAP_MATCHES */
 
-               len = item & OPTIMUM_LEN_MASK;
+               /* Tally/record a match. */
+               matchlen = item & OPTIMUM_LEN_MASK;
                adjusted_offset = item >> OPTIMUM_OFFSET_SHIFT;
-
-               /* Record the literal run length for the next sequence (the
-                * "previous sequence" when walking backwards). */
-               if (record) {
-                       c->chosen_sequences[seq_idx--].litrunlen =
-                                       lit_start_node - node_idx;
-                       node_idx -= len;
-                       lit_start_node = node_idx;
-               } else {
-                       node_idx -= len;
-               }
-
-               /* Record a match. */
-
-               /* Tally the aligned offset symbol if needed. */
-               if (adjusted_offset >= 16)
-                       c->freqs.aligned[adjusted_offset & LZX_ALIGNED_OFFSET_BITMASK]++;
-
-               /* Record the adjusted length. */
-               v = len - LZX_MIN_MATCH_LEN;
-               if (record)
-                       c->chosen_sequences[seq_idx].adjusted_length = v;
-
-               /* Tally the length symbol if needed. */
-               if (v >= LZX_NUM_PRIMARY_LENS) {
-                       c->freqs.len[v - LZX_NUM_PRIMARY_LENS]++;
-                       v = LZX_NUM_PRIMARY_LENS;
-               }
-
-               /* Tally the main symbol. */
-               offset_slot = lzx_get_offset_slot(c, adjusted_offset, is_16_bit);
-               v += offset_slot * LZX_NUM_LEN_HEADERS;
-               c->freqs.main[LZX_NUM_CHARS + v]++;
-
-               /* Record the adjusted offset and match header. */
+               mainsym = lzx_tally_main_and_lensyms(c, matchlen,
+                                                    adjusted_offset,
+                                                    is_16_bit);
+               if (adjusted_offset >= LZX_MIN_ALIGNED_OFFSET +
+                                      LZX_OFFSET_ADJUSTMENT)
+                       c->freqs.aligned[adjusted_offset &
+                                        LZX_ALIGNED_OFFSET_BITMASK]++;
                if (record) {
-                       c->chosen_sequences[seq_idx].adjusted_offset_and_match_hdr =
-                                       (adjusted_offset << 9) | v;
+                       seq->litrunlen_and_matchlen |=
+                               (litrun_end - node_idx) << SEQ_MATCHLEN_BITS;
+                       seq--;
+                       seq->litrunlen_and_matchlen = matchlen;
+                       seq->adjusted_offset_and_mainsym =
+                               (adjusted_offset << SEQ_MAINSYM_BITS) | mainsym;
+                       litrun_end = node_idx - matchlen;
                }
+               node_idx -= matchlen;
        }
 
        /* Record the literal run length for the first sequence. */
-       if (record)
-               c->chosen_sequences[seq_idx].litrunlen = lit_start_node - node_idx;
+       if (record) {
+               seq->litrunlen_and_matchlen |=
+                       (litrun_end - node_idx) << SEQ_MATCHLEN_BITS;
+       }
 
        /* Return the index in chosen_sequences at which the sequences begin. */
-       return seq_idx;
+       return seq - &c->chosen_sequences[0];
 }
 
 /*
@@ -1465,7 +1482,7 @@ lzx_walk_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit,
  * beginning of the block), but this doesn't matter because this function only
  * computes frequencies.
  */
-static inline void
+static forceinline void
 lzx_tally_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit)
 {
        lzx_walk_item_list(c, block_size, is_16_bit, false);
@@ -1480,7 +1497,7 @@ lzx_tally_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit)
  * first-to-last order.  The return value is the index in c->chosen_sequences at
  * which the lzx_sequences begin.
  */
-static inline u32
+static forceinline u32
 lzx_record_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit)
 {
        return lzx_walk_item_list(c, block_size, is_16_bit, true);
@@ -1520,7 +1537,7 @@ lzx_record_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit)
  * one step ahead, with the exception of special consideration for "gap
  * matches".
  */
-static inline struct lzx_lru_queue
+static forceinline struct lzx_lru_queue
 lzx_find_min_cost_path(struct lzx_compressor * const restrict c,
                       const u8 * const restrict block_begin,
                       const u32 block_size,
@@ -1703,7 +1720,7 @@ lzx_find_min_cost_path(struct lzx_compressor * const restrict c,
                                u32 cost;
 
                        #if CONSIDER_ALIGNED_COSTS
-                               if (offset >= 16 - LZX_OFFSET_ADJUSTMENT)
+                               if (offset >= LZX_MIN_ALIGNED_OFFSET)
                                        base_cost += c->costs.aligned[adjusted_offset &
                                                                      LZX_ALIGNED_OFFSET_BITMASK];
                        #endif
@@ -1844,7 +1861,7 @@ lzx_compute_match_costs(struct lzx_compressor *c)
                unsigned i;
 
        #if CONSIDER_ALIGNED_COSTS
-               if (offset_slot >= 8)
+               if (offset_slot >= LZX_MIN_ALIGNED_OFFSET_SLOT)
                        extra_cost -= LZX_NUM_ALIGNED_OFFSET_BITS * BIT_COST;
        #endif
 
@@ -1910,12 +1927,17 @@ lzx_cost_for_probability(float prob)
         *      entropy = -log2(probability)
         *
         * Use this to get the cost in fractional bits.  Then multiply by our
-        * scaling factor of BIT_COST and truncate to a u32.
+        * scaling factor of BIT_COST and convert to an integer.
         *
         * In addition, the minimum cost is BIT_COST (one bit) because the
         * entropy coding method will be Huffman codes.
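+        *
+        * For example, a symbol with probability 1/8 has entropy
+        * -log2(1/8) = 3 bits, so its computed cost comes out to roughly
+        * 3 * BIT_COST.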
+        *
+        * Careful: even though 'prob' should be <= 1.0, 'log2f_fast(prob)' may
+        * be positive due to inaccuracy in our log2 approximation.  Therefore,
+        * we cannot, in general, assume the computed cost is non-negative, and
+        * we should make sure negative costs get rounded up correctly.
         */
-       u32 cost = -log2f_fast(prob) * BIT_COST;
+       s32 cost = -log2f_fast(prob) * BIT_COST;
        return max(cost, BIT_COST);
 }
 
@@ -2085,7 +2107,7 @@ lzx_set_costs_from_codes(struct lzx_compressor *c)
  * for the block uses default costs; additional passes use costs derived from
  * the Huffman codes computed in the previous pass.
  */
-static inline struct lzx_lru_queue
+static forceinline struct lzx_lru_queue
 lzx_optimize_and_flush_block(struct lzx_compressor * const restrict c,
                             struct lzx_output_bitstream * const restrict os,
                             const u8 * const restrict block_begin,
@@ -2134,7 +2156,7 @@ lzx_optimize_and_flush_block(struct lzx_compressor * const restrict c,
  * time, but rather to produce a compression ratio significantly better than a
  * simpler "greedy" or "lazy" parse while still being relatively fast.
  */
-static inline void
+static forceinline void
 lzx_compress_near_optimal(struct lzx_compressor * restrict c,
                          const u8 * const restrict in_begin, size_t in_nbytes,
                          struct lzx_output_bitstream * restrict os,
@@ -2250,7 +2272,7 @@ lzx_compress_near_optimal(struct lzx_compressor * restrict c,
                        } else {
                                /* Don't search for matches at this position. */
                                CALL_BT_MF(is_16_bit, c,
-                                          bt_matchfinder_skip_position,
+                                          bt_matchfinder_skip_byte,
                                           in_begin,
                                           in_next - in_begin,
                                           nice_len,
@@ -2339,7 +2361,7 @@ lzx_compress_near_optimal_32(struct lzx_compressor *c, const u8 *in,
  * Huffman symbol for the literal, increments the current literal run length,
  * and "observes" the literal for the block split statistics.
  */
-static inline void
+static forceinline void
 lzx_choose_literal(struct lzx_compressor *c, unsigned literal, u32 *litrunlen_p)
 {
        lzx_observe_literal(&c->split_stats, literal);
@@ -2353,41 +2375,22 @@ lzx_choose_literal(struct lzx_compressor *c, unsigned literal, u32 *litrunlen_p)
  * literal run, updates the recent offsets queue, and "observes" the match for
  * the block split statistics.
  */
-static inline void
+static forceinline void
 lzx_choose_match(struct lzx_compressor *c, unsigned length, u32 adjusted_offset,
                 u32 recent_offsets[LZX_NUM_RECENT_OFFSETS], bool is_16_bit,
                 u32 *litrunlen_p, struct lzx_sequence **next_seq_p)
 {
-       u32 litrunlen = *litrunlen_p;
        struct lzx_sequence *next_seq = *next_seq_p;
-       unsigned offset_slot;
-       unsigned v;
+       unsigned mainsym;
 
        lzx_observe_match(&c->split_stats, length);
 
-       v = length - LZX_MIN_MATCH_LEN;
-
-       /* Save the literal run length and adjusted length. */
-       next_seq->litrunlen = litrunlen;
-       next_seq->adjusted_length = v;
-
-       /* Compute the length header, then tally the length symbol if needed. */
-       if (v >= LZX_NUM_PRIMARY_LENS) {
-               c->freqs.len[v - LZX_NUM_PRIMARY_LENS]++;
-               v = LZX_NUM_PRIMARY_LENS;
-       }
-
-       /* Compute the offset slot. */
-       offset_slot = lzx_get_offset_slot(c, adjusted_offset, is_16_bit);
-
-       /* Compute the match header. */
-       v += offset_slot * LZX_NUM_LEN_HEADERS;
-
-       /* Save the adjusted offset and match header. */
-       next_seq->adjusted_offset_and_match_hdr = (adjusted_offset << 9) | v;
-
-       /* Tally the main symbol. */
-       c->freqs.main[LZX_NUM_CHARS + v]++;
+       mainsym = lzx_tally_main_and_lensyms(c, length, adjusted_offset,
+                                            is_16_bit);
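+
+       /* Pack the literal run length together with the match length, and the
+        * adjusted offset together with the main symbol, into the next
+        * lzx_sequence. */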
+       next_seq->litrunlen_and_matchlen =
+               (*litrunlen_p << SEQ_MATCHLEN_BITS) | length;
+       next_seq->adjusted_offset_and_mainsym =
+               (adjusted_offset << SEQ_MAINSYM_BITS) | mainsym;
 
        /* Update the recent offsets queue. */
        if (adjusted_offset < LZX_NUM_RECENT_OFFSETS) {
@@ -2397,7 +2400,7 @@ lzx_choose_match(struct lzx_compressor *c, unsigned length, u32 adjusted_offset,
                /* Explicit offset match. */
 
                /* Tally the aligned offset symbol if needed. */
-               if (adjusted_offset >= 16)
+               if (adjusted_offset >= LZX_MIN_ALIGNED_OFFSET + LZX_OFFSET_ADJUSTMENT)
                        c->freqs.aligned[adjusted_offset & LZX_ALIGNED_OFFSET_BITMASK]++;
 
                recent_offsets[2] = recent_offsets[1];
@@ -2415,13 +2418,10 @@ lzx_choose_match(struct lzx_compressor *c, unsigned length, u32 adjusted_offset,
  * which is just a literal run with no following match.  This literal run might
  * be empty.
  */
-static inline void
+static forceinline void
 lzx_finish_sequence(struct lzx_sequence *last_seq, u32 litrunlen)
 {
-       last_seq->litrunlen = litrunlen;
-
-       /* Special value to mark last sequence */
-       last_seq->adjusted_offset_and_match_hdr = 0x80000000;
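+       /* A match length of 0 in the low SEQ_MATCHLEN_BITS bits marks this as
+        * the final, match-less sequence. */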
+       last_seq->litrunlen_and_matchlen = litrunlen << SEQ_MATCHLEN_BITS;
 }
 
 /*
@@ -2482,7 +2482,7 @@ lzx_find_longest_repeat_offset_match(const u8 * const in_next,
  * offset matches, since those require fewer bits to encode.
  */
 
-static inline unsigned
+static forceinline unsigned
 lzx_explicit_offset_match_score(unsigned len, u32 adjusted_offset)
 {
        unsigned score = len;
@@ -2495,7 +2495,7 @@ lzx_explicit_offset_match_score(unsigned len, u32 adjusted_offset)
        return score;
 }
 
-static inline unsigned
+static forceinline unsigned
 lzx_repeat_offset_match_score(unsigned rep_len, unsigned rep_idx)
 {
        return rep_len + 3;
@@ -2513,7 +2513,7 @@ lzx_repeat_offset_match_score(unsigned rep_len, unsigned rep_idx)
  * when we decide whether a match is "better" than another, we take the offset
  * into consideration as well as the length.
  */
-static inline void
+static forceinline void
 lzx_compress_lazy(struct lzx_compressor * restrict c,
                  const u8 * const restrict in_begin, size_t in_nbytes,
                  struct lzx_output_bitstream * restrict os, bool is_16_bit)
@@ -2568,7 +2568,7 @@ lzx_compress_lazy(struct lzx_compressor * restrict c,
                        cur_len = CALL_HC_MF(is_16_bit, c,
                                             hc_matchfinder_longest_match,
                                             in_begin,
-                                            in_next - in_begin,
+                                            in_next,
                                             2,
                                             max_len,
                                             nice_len,
@@ -2645,7 +2645,7 @@ lzx_compress_lazy(struct lzx_compressor * restrict c,
                        next_len = CALL_HC_MF(is_16_bit, c,
                                              hc_matchfinder_longest_match,
                                              in_begin,
-                                             in_next - in_begin,
+                                             in_next,
                                              cur_len - 2,
                                              max_len,
                                              nice_len,
@@ -2706,13 +2706,14 @@ lzx_compress_lazy(struct lzx_compressor * restrict c,
                        lzx_choose_match(c, cur_len, cur_adjusted_offset,
                                         recent_offsets, is_16_bit,
                                         &litrunlen, &next_seq);
-                       in_next = CALL_HC_MF(is_16_bit, c,
-                                            hc_matchfinder_skip_positions,
-                                            in_begin,
-                                            in_next - in_begin,
-                                            in_end - in_begin,
-                                            skip_len,
-                                            next_hashes);
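+                       /* Advance the matchfinder past the rest of the match,
+                        * then advance 'in_next' past it as well. */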
+                       CALL_HC_MF(is_16_bit, c,
+                                  hc_matchfinder_skip_bytes,
+                                  in_begin,
+                                  in_next,
+                                  in_end,
+                                  skip_len,
+                                  next_hashes);
+                       in_next += skip_len;
 
                        /* Keep going until it's time to end the block. */
                } while (in_next < in_max_block_end &&
@@ -2762,7 +2763,8 @@ lzx_init_offset_slot_tabs(struct lzx_compressor *c)
        for (; adjusted_offset < ARRAY_LEN(c->offset_slot_tab_1);
             adjusted_offset++)
        {
-               if (adjusted_offset >= lzx_offset_slot_base[slot + 1])
+               if (adjusted_offset >= lzx_offset_slot_base[slot + 1] +
+                                      LZX_OFFSET_ADJUSTMENT)
                        slot++;
                c->offset_slot_tab_1[adjusted_offset] = slot;
        }
@@ -2771,7 +2773,8 @@ lzx_init_offset_slot_tabs(struct lzx_compressor *c)
        for (; adjusted_offset < LZX_MAX_WINDOW_SIZE;
             adjusted_offset += (u32)1 << 14)
        {
-               if (adjusted_offset >= lzx_offset_slot_base[slot + 1])
+               if (adjusted_offset >= lzx_offset_slot_base[slot + 1] +
+                                      LZX_OFFSET_ADJUSTMENT)
                        slot++;
                c->offset_slot_tab_2[adjusted_offset >> 14] = slot;
        }