/* Number of bits currently held in @bitbuf */
unsigned bitcount;
- /* Pointer to one past the next position in the output buffer at which
- * to output a 16-bit coding unit */
- le16 *next;
-
/* Pointer to the beginning of the output buffer (this is the "end" when
* writing backwards!) */
- le16 *begin;
+ u8 *begin;
+
+ /* Pointer to just past the next position in the output buffer at which
+ * to output a 16-bit coding unit */
+ u8 *next;
};
/* This structure tracks the state of range encoding and its output, which
u32 cache_size;
/* Pointer to the beginning of the output buffer */
- le16 *begin;
+ u8 *begin;
/* Pointer to the position in the output buffer at which the next coding
* unit must be written */
- le16 *next;
+ u8 *next;
/* Pointer to just past the end of the output buffer */
- le16 *end;
+ u8 *end;
};
/* Bookkeeping information for an adaptive Huffman code */
#define DELTA_SOURCE_POWER_SHIFT 28
#define DELTA_SOURCE_RAW_OFFSET_MASK (((u32)1 << DELTA_SOURCE_POWER_SHIFT) - 1)
-static inline void
+static _unused_attribute void
check_that_powers_fit_in_bitfield(void)
{
-	BUILD_BUG_ON(LZMS_NUM_DELTA_POWER_SYMS > (1 << (31 - DELTA_SOURCE_POWER_SHIFT)));
+	/* Compile-time check that every delta power symbol fits in the bits
+	 * above DELTA_SOURCE_POWER_SHIFT (bit 31 is reserved — presumably for
+	 * DELTA_SOURCE_TAG; confirm against the tag definition). */
+	STATIC_ASSERT(LZMS_NUM_DELTA_POWER_SYMS <= (1 << (31 - DELTA_SOURCE_POWER_SHIFT)));
}
/* A stripped-down version of the adaptive state in LZMS which excludes the
*
* Note: this adaptive state structure also does not include the
* probability entries or current Huffman codewords. Those aren't
- * maintained per-position and are only updated occassionally.
+ * maintained per-position and are only updated occasionally.
*/
struct lzms_adaptive_state state;
} _aligned_attribute(64);
u32 next_delta_hashes[NUM_POWERS_TO_CONSIDER];
/* The per-byte graph nodes for near-optimal parsing */
- struct lzms_optimum_node optimum_nodes[NUM_OPTIM_NODES + MAX_FAST_LENGTH];
+ struct lzms_optimum_node optimum_nodes[NUM_OPTIM_NODES + MAX_FAST_LENGTH +
+ 1 + MAX_FAST_LENGTH];
/* Table: length => current cost for small match lengths */
u32 fast_length_cost_tab[MAX_FAST_LENGTH + 1];
unsigned lz_rep_states[LZMS_NUM_LZ_REP_DECISIONS];
unsigned delta_state;
unsigned delta_rep_states[LZMS_NUM_DELTA_REP_DECISIONS];
- struct lzms_probability_entry main_probs[LZMS_NUM_MAIN_PROBS];
- struct lzms_probability_entry match_probs[LZMS_NUM_MATCH_PROBS];
- struct lzms_probability_entry lz_probs[LZMS_NUM_LZ_PROBS];
- struct lzms_probability_entry lz_rep_probs[LZMS_NUM_LZ_REP_DECISIONS]
- [LZMS_NUM_LZ_REP_PROBS];
- struct lzms_probability_entry delta_probs[LZMS_NUM_DELTA_PROBS];
- struct lzms_probability_entry delta_rep_probs[LZMS_NUM_DELTA_REP_DECISIONS]
- [LZMS_NUM_DELTA_REP_PROBS];
+ struct lzms_probabilites probs;
/* Huffman codes */
* Return the length slot for the specified match length, using the compressor's
* acceleration table if the length is small enough.
*/
-static inline unsigned
+static forceinline unsigned
lzms_comp_get_length_slot(const struct lzms_compressor *c, u32 length)
{
if (likely(length <= MAX_FAST_LENGTH))
* Return the offset slot for the specified match offset, using the compressor's
* acceleration tables to speed up the mapping.
*/
-static inline unsigned
+static forceinline unsigned
lzms_comp_get_offset_slot(const struct lzms_compressor *c, u32 offset)
{
if (offset < 0xe4a5)
/*
* Initialize the range encoder @rc to write forwards to the specified buffer
- * @out that is @count 16-bit integers long.
+ * @out that is @size bytes long.
*/
static void
-lzms_range_encoder_init(struct lzms_range_encoder *rc, le16 *out, size_t count)
+lzms_range_encoder_init(struct lzms_range_encoder *rc, u8 *out, size_t size)
{
rc->lower_bound = 0;
rc->range_size = 0xffffffff;
rc->cache = 0;
rc->cache_size = 1;
rc->begin = out;
-	rc->next = out - 1;
-	rc->end = out + count;
+	/* Start one 16-bit unit *before* the buffer: the first unit the
+	 * encoder shifts out is a dummy that is never stored (the flush path
+	 * only writes when rc->next >= rc->begin). */
+	rc->next = out - sizeof(le16);
+	/* Round the usable size down to a whole number of 16-bit units. */
+	rc->end = out + (size & ~1);
}
/*
do {
if (likely(rc->next >= rc->begin)) {
if (rc->next != rc->end) {
- put_unaligned_u16_le(rc->cache +
- (u16)(rc->lower_bound >> 32),
- rc->next++);
+ put_unaligned_le16(rc->cache +
+ (u16)(rc->lower_bound >> 32),
+ rc->next);
+ rc->next += sizeof(le16);
}
} else {
- rc->next++;
+ rc->next += sizeof(le16);
}
rc->cache = 0xffff;
} while (--rc->cache_size != 0);
* @prob is the probability out of LZMS_PROBABILITY_DENOMINATOR that the next
* bit is 0 rather than 1.
*/
-static inline void
+static forceinline void
lzms_range_encode_bit(struct lzms_range_encoder *rc, int bit, u32 prob)
{
/* Normalize if needed. */
* Encode a bit. This wraps around lzms_range_encode_bit() to handle using and
* updating the state and its corresponding probability entry.
*/
-static inline void
+static forceinline void
lzms_encode_bit(int bit, unsigned *state_p, unsigned num_states,
struct lzms_probability_entry *probs,
struct lzms_range_encoder *rc)
lzms_encode_main_bit(struct lzms_compressor *c, int bit)
{
lzms_encode_bit(bit, &c->main_state, LZMS_NUM_MAIN_PROBS,
- c->main_probs, &c->rc);
+ c->probs.main, &c->rc);
}
static void
lzms_encode_match_bit(struct lzms_compressor *c, int bit)
{
lzms_encode_bit(bit, &c->match_state, LZMS_NUM_MATCH_PROBS,
- c->match_probs, &c->rc);
+ c->probs.match, &c->rc);
}
static void
lzms_encode_lz_bit(struct lzms_compressor *c, int bit)
{
lzms_encode_bit(bit, &c->lz_state, LZMS_NUM_LZ_PROBS,
- c->lz_probs, &c->rc);
+ c->probs.lz, &c->rc);
}
static void
lzms_encode_lz_rep_bit(struct lzms_compressor *c, int bit, int idx)
{
lzms_encode_bit(bit, &c->lz_rep_states[idx], LZMS_NUM_LZ_REP_PROBS,
- c->lz_rep_probs[idx], &c->rc);
+ c->probs.lz_rep[idx], &c->rc);
}
static void
lzms_encode_delta_bit(struct lzms_compressor *c, int bit)
{
lzms_encode_bit(bit, &c->delta_state, LZMS_NUM_DELTA_PROBS,
- c->delta_probs, &c->rc);
+ c->probs.delta, &c->rc);
}
static void
lzms_encode_delta_rep_bit(struct lzms_compressor *c, int bit, int idx)
{
lzms_encode_bit(bit, &c->delta_rep_states[idx], LZMS_NUM_DELTA_REP_PROBS,
- c->delta_rep_probs[idx], &c->rc);
+ c->probs.delta_rep[idx], &c->rc);
}
/******************************************************************************
/*
* Initialize the output bitstream @os to write backwards to the specified
- * buffer @out that is @count 16-bit integers long.
+ * buffer @out that is @size bytes long.
*/
static void
lzms_output_bitstream_init(struct lzms_output_bitstream *os,
-			   le16 *out, size_t count)
+			   u8 *out, size_t size)
{
os->bitbuf = 0;
os->bitcount = 0;
-	os->next = out + count;
os->begin = out;
+	/* This stream is written backwards, so start at the end of the
+	 * buffer, rounded down to a whole number of 16-bit coding units. */
+	os->next = out + (size & ~1);
}
/*
* @max_num_bits is a compile-time constant that specifies the maximum number of
* bits that can ever be written at this call site.
*/
-static inline void
+static forceinline void
lzms_write_bits(struct lzms_output_bitstream *os, const u32 bits,
const unsigned num_bits, const unsigned max_num_bits)
{
os->bitcount -= 16;
/* Write a coding unit, unless it would underflow the buffer. */
- if (os->next != os->begin)
- put_unaligned_u16_le(os->bitbuf >> os->bitcount, --os->next);
+ if (os->next != os->begin) {
+ os->next -= sizeof(le16);
+ put_unaligned_le16(os->bitbuf >> os->bitcount, os->next);
+ }
/* Optimization for call sites that never write more than 16
* bits at once. */
if (os->next == os->begin)
return false;
- if (os->bitcount != 0)
- put_unaligned_u16_le(os->bitbuf << (16 - os->bitcount), --os->next);
+ if (os->bitcount != 0) {
+ os->next -= sizeof(le16);
+ put_unaligned_le16(os->bitbuf << (16 - os->bitcount), os->next);
+ }
return true;
}
* Encode a symbol using the specified Huffman code. Then, if the Huffman code
* needs to be rebuilt, rebuild it and return true; otherwise return false.
*/
-static inline bool
+static forceinline bool
lzms_huffman_encode_symbol(unsigned sym,
const u32 *codewords, const u8 *lens, u32 *freqs,
struct lzms_output_bitstream *os,
} while (cur_node != end_node);
}
-static inline void
+static forceinline void
lzms_encode_item_list(struct lzms_compressor *c,
struct lzms_optimum_node *end_node)
{
}
/******************************************************************************
- * Cost evalution *
+ * Cost evaluation *
******************************************************************************/
/*
1
};
-static inline void
+static _unused_attribute void
check_cost_shift(void)
{
/* lzms_bit_costs is hard-coded to the current COST_SHIFT. */
-	BUILD_BUG_ON(COST_SHIFT != 6);
+	/* Compile-time sanity check only; nothing calls this function. */
+	STATIC_ASSERT(COST_SHIFT == 6);
}
#if 0
#endif
/* Return the cost to encode a 0 bit in the specified context. */
-static inline u32
+static forceinline u32
lzms_bit_0_cost(unsigned state, const struct lzms_probability_entry *probs)
{
+	/* The cost table is indexed by the context's count of recent 0 bits. */
return lzms_bit_costs[probs[state].num_recent_zero_bits];
}
/* Return the cost to encode a 1 bit in the specified context. */
-static inline u32
+static forceinline u32
lzms_bit_1_cost(unsigned state, const struct lzms_probability_entry *probs)
{
return lzms_bit_costs[LZMS_PROBABILITY_DENOMINATOR -
}
/* Return the cost to encode a literal, including the main bit. */
-static inline u32
+static forceinline u32
lzms_literal_cost(struct lzms_compressor *c, unsigned main_state, unsigned literal)
{
-	return lzms_bit_0_cost(main_state, c->main_probs) +
+	/* main bit 0 selects "literal"; add the literal's Huffman codeword
+	 * length scaled into cost units (see COST_SHIFT). */
+	return lzms_bit_0_cost(main_state, c->probs.main) +
((u32)c->literal_lens[literal] << COST_SHIFT);
}
/* Return the cost to encode the specified match length, which must not exceed
* MAX_FAST_LENGTH. */
-static inline u32
+static forceinline u32
lzms_fast_length_cost(const struct lzms_compressor *c, u32 length)
{
+	/* Precomputed table; valid only for length <= MAX_FAST_LENGTH. */
return c->fast_length_cost_tab[length];
}
/* Return the cost to encode the specified LZ match offset. */
-static inline u32
+static forceinline u32
lzms_lz_offset_cost(const struct lzms_compressor *c, u32 offset)
{
unsigned slot = lzms_comp_get_offset_slot(c, offset);
}
/* Return the cost to encode the specified delta power and raw offset. */
-static inline u32
+static forceinline u32
lzms_delta_source_cost(const struct lzms_compressor *c, u32 power, u32 raw_offset)
{
unsigned slot = lzms_comp_get_offset_slot(c, raw_offset);
state->prev_delta_pair = state->upcoming_delta_pair;
}
-static inline void
+static forceinline void
lzms_update_state(u8 *state_p, int bit, unsigned num_states)
{
-	*state_p = ((*state_p << 1) | bit) % num_states;
+	/* Shift the new bit into the state. num_states must be a power of 2
+	 * for the mask to be equivalent to the old '%' form (it avoids the
+	 * division). */
+	*state_p = ((*state_p << 1) | bit) & (num_states - 1);
}
-static inline void
+static forceinline void
lzms_update_main_state(struct lzms_adaptive_state *state, int is_match)
{
+	/* main bit: 0 = literal, 1 = match */
lzms_update_state(&state->main_state, is_match, LZMS_NUM_MAIN_PROBS);
}
-static inline void
+static forceinline void
lzms_update_match_state(struct lzms_adaptive_state *state, int is_delta)
{
+	/* match bit: 0 = LZ match, 1 = delta match */
lzms_update_state(&state->match_state, is_delta, LZMS_NUM_MATCH_PROBS);
}
-static inline void
+static forceinline void
lzms_update_lz_state(struct lzms_adaptive_state *state, int is_rep)
{
+	/* lz bit: 0 = explicit offset, 1 = repeat offset */
lzms_update_state(&state->lz_state, is_rep, LZMS_NUM_LZ_PROBS);
}
-static inline void
+static forceinline void
lzms_update_lz_rep_states(struct lzms_adaptive_state *state, int rep_idx)
{
for (int i = 0; i < rep_idx; i++)
lzms_update_state(&state->lz_rep_states[rep_idx], 0, LZMS_NUM_LZ_REP_PROBS);
}
-static inline void
+static forceinline void
lzms_update_delta_state(struct lzms_adaptive_state *state, int is_rep)
{
+	/* delta bit: 0 = explicit offset, 1 = repeat offset */
lzms_update_state(&state->delta_state, is_rep, LZMS_NUM_DELTA_PROBS);
}
-static inline void
+static forceinline void
lzms_update_delta_rep_states(struct lzms_adaptive_state *state, int rep_idx)
{
for (int i = 0; i < rep_idx; i++)
lzms_init_delta_matchfinder(struct lzms_compressor *c)
{
/* Set all entries to use an invalid power, which will never match. */
- BUILD_BUG_ON(NUM_POWERS_TO_CONSIDER >= (1 << (32 - DELTA_SOURCE_POWER_SHIFT)));
+ STATIC_ASSERT(NUM_POWERS_TO_CONSIDER < (1 << (32 - DELTA_SOURCE_POWER_SHIFT)));
memset(c->delta_hash_table, 0xFF, sizeof(c->delta_hash_table));
/* Initialize the next hash code for each power. We can just use zeroes
* NBYTES_HASHED_FOR_DELTA bytes of the sequence beginning at @p when taken in a
* delta context with the specified @span.
*/
-static inline u32
-lzms_delta_hash(const u8 *p, u32 span)
+static forceinline u32
+lzms_delta_hash(const u8 *p, const u32 pos, u32 span)
{
/* A delta match has a certain span and an offset that is a multiple of
* that span. To reduce wasted space we use a single combined hash
* include in the hash code computation the span and the low-order bits
* of the current position. */
- BUILD_BUG_ON(NBYTES_HASHED_FOR_DELTA != 3);
+ STATIC_ASSERT(NBYTES_HASHED_FOR_DELTA == 3);
u8 d0 = *(p + 0) - *(p + 0 - span);
u8 d1 = *(p + 1) - *(p + 1 - span);
u8 d2 = *(p + 2) - *(p + 2 - span);
- u32 v = ((span + ((u32)(uintptr_t)p & (span - 1))) << 24) |
+ u32 v = ((span + (pos & (span - 1))) << 24) |
((u32)d2 << 16) | ((u32)d1 << 8) | d0;
return lz_hash(v, DELTA_HASH_ORDER);
}
* specified @span and having the initial @len, extend the match as far as
* possible, up to a limit of @max_len.
*/
-static inline u32
+static forceinline u32
lzms_extend_delta_match(const u8 *in_next, const u8 *matchptr,
u32 len, u32 max_len, u32 span)
{
const u32 span = (u32)1 << power;
if (unlikely(pos < span))
continue;
- const u32 next_hash = lzms_delta_hash(in_next + 1, span);
+ const u32 next_hash = lzms_delta_hash(in_next + 1, pos + 1, span);
const u32 hash = c->next_delta_hashes[power];
c->delta_hash_table[hash] =
(power << DELTA_SOURCE_POWER_SHIFT) | pos;
c->next_delta_hashes[power] = next_hash;
- prefetch(&c->delta_hash_table[next_hash]);
+ prefetchw(&c->delta_hash_table[next_hash]);
}
} while (in_next++, pos++, --count);
}
* can be reached using a match or literal from the current position. This is
* essentially Dijkstra's algorithm in disguise: the graph nodes are positions,
* the graph edges are possible matches/literals to code, and the cost of each
- * edge is the estimated number of bits that will be required to output the
- * corresponding match or literal. But one difference is that we actually
- * compute the lowest-cost path in pieces, where each piece is terminated when
- * there are no choices to be made.
+ * edge is the estimated number of bits (scaled up by COST_SHIFT) that will be
+ * required to output the corresponding match or literal. But one difference is
+ * that we actually compute the lowest-cost path in pieces, where each piece is
+ * terminated when there are no choices to be made.
*
* The costs of literals and matches are estimated using the range encoder
* states and the semi-adaptive Huffman codes. Except for range encoding
u32 base_cost = cur_node->cost +
lzms_bit_1_cost(cur_node->state.main_state,
- c->main_probs) +
+ c->probs.main) +
lzms_bit_0_cost(cur_node->state.match_state,
- c->match_probs) +
+ c->probs.match) +
lzms_bit_1_cost(cur_node->state.lz_state,
- c->lz_probs);
+ c->probs.lz);
for (int i = 0; i < rep_idx; i++)
base_cost += lzms_bit_1_cost(cur_node->state.lz_rep_states[i],
- c->lz_rep_probs[i]);
+ c->probs.lz_rep[i]);
if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS)
base_cost += lzms_bit_0_cost(cur_node->state.lz_rep_states[rep_idx],
- c->lz_rep_probs[rep_idx]);
+ c->probs.lz_rep[rep_idx]);
u32 len = 2;
do {
main_state = ((main_state << 1) | 0) % LZMS_NUM_MAIN_PROBS;
/* add LZ-rep0 cost */
- cost += lzms_bit_1_cost(main_state, c->main_probs) +
- lzms_bit_0_cost(match_state, c->match_probs) +
- lzms_bit_1_cost(lz_state, c->lz_probs) +
- lzms_bit_0_cost(lz_rep0_state, c->lz_rep_probs[0]) +
+ cost += lzms_bit_1_cost(main_state, c->probs.main) +
+ lzms_bit_0_cost(match_state, c->probs.match) +
+ lzms_bit_1_cost(lz_state, c->probs.lz) +
+ lzms_bit_0_cost(lz_rep0_state, c->probs.lz_rep[0]) +
lzms_fast_length_cost(c, rep0_len);
const u32 total_len = rep_len + 1 + rep0_len;
u32 base_cost = cur_node->cost +
lzms_bit_1_cost(cur_node->state.main_state,
- c->main_probs) +
+ c->probs.main) +
lzms_bit_1_cost(cur_node->state.match_state,
- c->match_probs) +
+ c->probs.match) +
lzms_bit_1_cost(cur_node->state.delta_state,
- c->delta_probs);
+ c->probs.delta);
for (int i = 0; i < rep_idx; i++)
base_cost += lzms_bit_1_cost(cur_node->state.delta_rep_states[i],
- c->delta_rep_probs[i]);
+ c->probs.delta_rep[i]);
if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS)
base_cost += lzms_bit_0_cost(cur_node->state.delta_rep_states[rep_idx],
- c->delta_rep_probs[rep_idx]);
+ c->probs.delta_rep[rep_idx]);
u32 len = 2;
do {
u32 base_cost = cur_node->cost +
lzms_bit_1_cost(cur_node->state.main_state,
- c->main_probs) +
+ c->probs.main) +
lzms_bit_0_cost(cur_node->state.match_state,
- c->match_probs) +
+ c->probs.match) +
lzms_bit_0_cost(cur_node->state.lz_state,
- c->lz_probs);
+ c->probs.lz);
if (c->try_lzmatch_lit_lzrep0 &&
likely(in_end - (in_next + c->matches[0].length) >= 3))
main_state = ((main_state << 1) | 0) % LZMS_NUM_MAIN_PROBS;
/* add LZ-rep0 cost */
- cost += lzms_bit_1_cost(main_state, c->main_probs) +
- lzms_bit_0_cost(match_state, c->match_probs) +
- lzms_bit_1_cost(lz_state, c->lz_probs) +
+ cost += lzms_bit_1_cost(main_state, c->probs.main) +
+ lzms_bit_0_cost(match_state, c->probs.match) +
+ lzms_bit_1_cost(lz_state, c->probs.lz) +
lzms_bit_0_cost(cur_node->state.lz_rep_states[0],
- c->lz_rep_probs[0]) +
+ c->probs.lz_rep[0]) +
lzms_fast_length_cost(c, rep0_len);
const u32 total_len = len + 1 + rep0_len;
const u32 pos = in_next - c->in_buffer;
/* Consider each possible power (log2 of span) */
- BUILD_BUG_ON(NUM_POWERS_TO_CONSIDER > LZMS_NUM_DELTA_POWER_SYMS);
+ STATIC_ASSERT(NUM_POWERS_TO_CONSIDER <= LZMS_NUM_DELTA_POWER_SYMS);
for (u32 power = 0; power < NUM_POWERS_TO_CONSIDER; power++) {
const u32 span = (u32)1 << power;
if (unlikely(pos < span))
continue;
- const u32 next_hash = lzms_delta_hash(in_next + 1, span);
+ const u32 next_hash = lzms_delta_hash(in_next + 1, pos + 1, span);
const u32 hash = c->next_delta_hashes[power];
const u32 cur_match = c->delta_hash_table[hash];
c->delta_hash_table[hash] = (power << DELTA_SOURCE_POWER_SHIFT) | pos;
c->next_delta_hashes[power] = next_hash;
- prefetch(&c->delta_hash_table[next_hash]);
+ prefetchw(&c->delta_hash_table[next_hash]);
if (power != cur_match >> DELTA_SOURCE_POWER_SHIFT)
continue;
/* Check the first 3 bytes before entering the
* extension loop. */
- BUILD_BUG_ON(NBYTES_HASHED_FOR_DELTA != 3);
+ STATIC_ASSERT(NBYTES_HASHED_FOR_DELTA == 3);
if (((u8)(*(in_next + 0) - *(in_next + 0 - span)) !=
(u8)(*(matchptr + 0) - *(matchptr + 0 - span))) ||
((u8)(*(in_next + 1) - *(in_next + 1 - span)) !=
/* Extend the delta match to its full length. */
const u32 len = lzms_extend_delta_match(in_next,
matchptr,
- 3,
+ NBYTES_HASHED_FOR_DELTA,
in_end - in_next,
span);
const u32 raw_offset = offset >> power;
+
+ if (unlikely(raw_offset > DELTA_SOURCE_RAW_OFFSET_MASK -
+ (LZMS_NUM_DELTA_REPS - 1)))
+ continue;
+
const u32 pair = (power << DELTA_SOURCE_POWER_SHIFT) |
raw_offset;
const u32 source = DELTA_SOURCE_TAG |
u32 base_cost = cur_node->cost +
lzms_bit_1_cost(cur_node->state.main_state,
- c->main_probs) +
+ c->probs.main) +
lzms_bit_1_cost(cur_node->state.match_state,
- c->match_probs) +
+ c->probs.match) +
lzms_bit_0_cost(cur_node->state.delta_state,
- c->delta_probs) +
+ c->probs.delta) +
lzms_delta_source_cost(c, power, raw_offset);
u32 l = NBYTES_HASHED_FOR_DELTA;
/* Add cost of LZ-rep0 */
const u32 cost = cur_and_lit_cost +
- lzms_bit_1_cost(main_state, c->main_probs) +
+ lzms_bit_1_cost(main_state, c->probs.main) +
lzms_bit_0_cost(cur_node->state.match_state,
- c->match_probs) +
+ c->probs.match) +
lzms_bit_1_cost(cur_node->state.lz_state,
- c->lz_probs) +
+ c->probs.lz) +
lzms_bit_0_cost(cur_node->state.lz_rep_states[0],
- c->lz_rep_probs[0]) +
+ c->probs.lz_rep[0]) +
lzms_fast_length_cost(c, rep0_len);
const u32 total_len = 1 + rep0_len;
* Finalize the adaptive state that results from taking this
* lowest-cost path. */
struct lzms_item item_to_take = cur_node->item;
- struct lzms_optimum_node *source_node = cur_node - (item_to_take.length);
+ struct lzms_optimum_node *source_node = cur_node - item_to_take.length;
int next_item_idx = -1;
for (unsigned i = 0; i < cur_node->num_extra_items; i++) {
item_to_take = cur_node->extra_items[i];
if (source >= LZMS_NUM_DELTA_REPS) {
/* Explicit offset delta match */
- u32 pair = source - (LZMS_NUM_DELTA_REPS - 1);
lzms_update_delta_state(&cur_node->state, 0);
- cur_node->state.upcoming_delta_pair = pair;
+ cur_node->state.upcoming_delta_pair =
+ source - (LZMS_NUM_DELTA_REPS - 1);
} else {
/* Repeat offset delta match */
int rep_idx = source;
for (int i = 0; i < LZMS_NUM_DELTA_REP_DECISIONS; i++)
c->delta_rep_states[i] = 0;
- lzms_init_probability_entries(c->main_probs, LZMS_NUM_MAIN_PROBS);
- lzms_init_probability_entries(c->match_probs, LZMS_NUM_MATCH_PROBS);
- lzms_init_probability_entries(c->lz_probs, LZMS_NUM_LZ_PROBS);
- for (int i = 0; i < LZMS_NUM_LZ_REP_DECISIONS; i++)
- lzms_init_probability_entries(c->lz_rep_probs[i], LZMS_NUM_LZ_REP_PROBS);
- lzms_init_probability_entries(c->delta_probs, LZMS_NUM_DELTA_PROBS);
- for (int i = 0; i < LZMS_NUM_DELTA_REP_DECISIONS; i++)
- lzms_init_probability_entries(c->delta_rep_probs[i], LZMS_NUM_DELTA_REP_PROBS);
+ lzms_init_probabilities(&c->probs);
}
static void
static size_t
lzms_finalize(struct lzms_compressor *c)
{
- size_t num_forwards_units;
- size_t num_backwards_units;
+ size_t num_forwards_bytes;
+ size_t num_backwards_bytes;
/* Flush both the forwards and backwards streams, and make sure they
* didn't cross each other and start overwriting each other's data. */
* bitstream. Move the data output by the backwards bitstream to be
* adjacent to the data output by the forward bitstream, and calculate
* the compressed size that this results in. */
- num_forwards_units = c->rc.next - c->rc.begin;
- num_backwards_units = c->rc.end - c->os.next;
+ num_forwards_bytes = c->rc.next - c->rc.begin;
+ num_backwards_bytes = c->rc.end - c->os.next;
- memmove(c->rc.next, c->os.next, num_backwards_units * sizeof(le16));
+ memmove(c->rc.next, c->os.next, num_backwards_bytes);
- return (num_forwards_units + num_backwards_units) * sizeof(le16);
+ return num_forwards_bytes + num_backwards_bytes;
}
static u64
}
static size_t
-lzms_compress(const void *in, size_t in_nbytes,
- void *out, size_t out_nbytes_avail, void *_c)
+lzms_compress(const void *restrict in, size_t in_nbytes,
+ void *restrict out, size_t out_nbytes_avail, void *restrict _c)
{
struct lzms_compressor *c = _c;
size_t result;
lzms_init_delta_matchfinder(c);
/* Initialize the encoder structures. */
- lzms_range_encoder_init(&c->rc, out, out_nbytes_avail / sizeof(le16));
- lzms_output_bitstream_init(&c->os, out, out_nbytes_avail / sizeof(le16));
+ lzms_range_encoder_init(&c->rc, out, out_nbytes_avail);
+ lzms_output_bitstream_init(&c->os, out, out_nbytes_avail);
lzms_init_states_and_probabilities(c);
lzms_init_huffman_codes(c, lzms_get_num_offset_slots(c->in_nbytes));