wimlib.net Git - wimlib/blobdiff - src/lzms-compress.c
LZMS: decompression optimizations
index fb1f777ac6b5c3adeb69c1a8bf7508e603baf92f..5f5e37412db828a0c1a96d667b9e854e4b5a0959 100644 (file)
@@ -7,20 +7,18 @@
 /*
  * Copyright (C) 2013, 2014 Eric Biggers
  *
- * This file is part of wimlib, a library for working with WIM files.
+ * This file is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 3 of the License, or (at your option) any
+ * later version.
  *
- * wimlib is free software; you can redistribute it and/or modify it under the
- * terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 3 of the License, or (at your option)
- * any later version.
- *
- * wimlib is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
- * A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * This file is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
  * details.
  *
- * You should have received a copy of the GNU General Public License
- * along with wimlib; if not, see http://www.gnu.org/licenses/.
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this file; if not, see http://www.gnu.org/licenses/.
  */
 
 #ifdef HAVE_CONFIG_H
@@ -34,6 +32,7 @@
 #include "wimlib/lz_mf.h"
 #include "wimlib/lz_repsearch.h"
 #include "wimlib/lzms.h"
+#include "wimlib/unaligned.h"
 #include "wimlib/util.h"
 
 #include <string.h>
@@ -207,6 +206,33 @@ struct lzms_compressor {
        u8 offset_slot_fast[LZMS_NUM_FAST_OFFSETS];
 };
 
+struct lzms_lz_lru_queue {
+       u32 recent_offsets[LZMS_NUM_RECENT_OFFSETS + 1];
+       u32 prev_offset;
+       u32 upcoming_offset;
+};
+
+static void
+lzms_init_lz_lru_queue(struct lzms_lz_lru_queue *queue)
+{
+       for (int i = 0; i < LZMS_NUM_RECENT_OFFSETS + 1; i++)
+               queue->recent_offsets[i] = i + 1;
+
+       queue->prev_offset = 0;
+       queue->upcoming_offset = 0;
+}
+
+static void
+lzms_update_lz_lru_queue(struct lzms_lz_lru_queue *queue)
+{
+       if (queue->prev_offset != 0) {
+               for (int i = LZMS_NUM_RECENT_OFFSETS - 1; i >= 0; i--)
+                       queue->recent_offsets[i + 1] = queue->recent_offsets[i];
+               queue->recent_offsets[0] = queue->prev_offset;
+       }
+       queue->prev_offset = queue->upcoming_offset;
+}
+
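
A minimal usage sketch of the new delayed-update LRU queue (hypothetical caller, not part of this commit): the offset consumed at one item is recorded in upcoming_offset, becomes prev_offset on the next update, and only reaches recent_offsets[] on the update after that, so it is not available for repeat-offset matching until one item later.

	/* Hypothetical helper illustrating how the queue above is driven;
	 * chosen_offset would be the offset coded at the current item,
	 * or 0 if no offset was used.  */
	static void
	example_record_item(struct lzms_lz_lru_queue *queue, u32 chosen_offset)
	{
		queue->upcoming_offset = chosen_offset;

		/* Advance by one item: push prev_offset (if nonzero) onto the
		 * recent-offsets list, then promote upcoming_offset.  */
		lzms_update_lz_lru_queue(queue);
	}
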
 /*
  * Match chooser position data:
  *
@@ -259,7 +285,7 @@ struct lzms_mc_pos_data {
         * entries or current Huffman codewords.  Those aren't maintained
         * per-position and are only updated occasionally.  */
        struct lzms_adaptive_state {
-               struct lzms_lz_lru_queues lru;
+               struct lzms_lz_lru_queue lru;
                u8 main_state;
                u8 match_state;
                u8 lz_match_state;
@@ -340,7 +366,7 @@ lzms_output_bitstream_put_varbits(struct lzms_output_bitstream *os,
 
                /* Write a coding unit, unless it would underflow the buffer. */
                if (os->next != os->begin)
-                       *--os->next = cpu_to_le16(os->bitbuf >> os->bitcount);
+                       put_unaligned_u16_le(os->bitbuf >> os->bitcount, --os->next);
 
                /* Optimization for call sites that never write more than 16
                 * bits at once.  */
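
Here the store through a le16 pointer is replaced by put_unaligned_u16_le() from the newly included "wimlib/unaligned.h", so the write no longer relies on the destination being 2-byte aligned. As a rough, self-contained sketch of the idea behind such a helper (not the actual header contents):

	#include <stdint.h>
	#include <string.h>

	/* Sketch only: store a 16-bit value in little-endian byte order at an
	 * address of any alignment.  A fixed-size memcpy() like this is
	 * normally compiled down to a single store on architectures that
	 * allow unaligned access.  */
	static inline void
	example_put_unaligned_u16_le(uint16_t v, void *p)
	{
		uint8_t bytes[2] = { (uint8_t)v, (uint8_t)(v >> 8) };
		memcpy(p, bytes, sizeof(bytes));
	}
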
@@ -349,15 +375,6 @@ lzms_output_bitstream_put_varbits(struct lzms_output_bitstream *os,
        }
 }
 
-/* Use when @num_bits is a compile-time constant.  Otherwise use
- * lzms_output_bitstream_put_varbits().  */
-static inline void
-lzms_output_bitstream_put_bits(struct lzms_output_bitstream *os,
-                              u32 bits, unsigned num_bits)
-{
-       lzms_output_bitstream_put_varbits(os, bits, num_bits, num_bits);
-}
-
 /* Flush the output bitstream, ensuring that all bits written to it have been
  * written to memory.  Returns %true if all bits have been output successfully,
  * or %false if an overrun occurred.  */
@@ -368,7 +385,7 @@ lzms_output_bitstream_flush(struct lzms_output_bitstream *os)
                return false;
 
        if (os->bitcount != 0)
-               *--os->next = cpu_to_le16(os->bitbuf << (16 - os->bitcount));
+               put_unaligned_u16_le(os->bitbuf << (16 - os->bitcount), --os->next);
 
        return true;
 }
@@ -412,9 +429,11 @@ lzms_range_encoder_raw_shift_low(struct lzms_range_encoder_raw *rc)
                 * ((rc->low >> 32) != 0, a.k.a. the carry bit is 1).  */
                do {
                        if (likely(rc->next >= rc->begin)) {
-                               if (rc->next != rc->end)
-                                       *rc->next++ = cpu_to_le16(rc->cache +
-                                                                 (u16)(rc->low >> 32));
+                               if (rc->next != rc->end) {
+                                       put_unaligned_u16_le(rc->cache +
+                                                            (u16)(rc->low >> 32),
+                                                            rc->next++);
+                               }
                        } else {
                                rc->next++;
                        }
@@ -716,7 +735,7 @@ lzms_do_init_rc_costs(void)
                for (u32 j = 0; j < LZMS_COST_SHIFT; j++) {
                        w *= w;
                        bit_count <<= 1;
-                       while (w >= (1U << 16)) {
+                       while (w >= ((u32)1 << 16)) {
                                w >>= 1;
                                ++bit_count;
                        }
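
The squaring loop above is a fixed-point base-2 logarithm: each pass squares w, doubling the exponent being measured, then renormalizes it back below 2^16, counting one bit per shift. A self-contained sketch of the technique (hypothetical names; assumes w < 2^16 on entry so the squaring cannot overflow a u32):

	#include <stdint.h>

	/* Sketch only: after `shift` iterations, bit_count approximates
	 * 2^shift * log2(w), up to a constant offset left in the final
	 * normalized residue, which the caller folds into its cost formula.  */
	static uint32_t
	example_scaled_log2(uint32_t w, unsigned shift)
	{
		uint32_t bit_count = 0;

		for (unsigned j = 0; j < shift; j++) {
			w *= w;            /* doubles log2(w); fits since w < 2^16 */
			bit_count <<= 1;
			while (w >= ((uint32_t)1 << 16)) {
				w >>= 1;   /* each shift contributes one counted bit */
				++bit_count;
			}
		}
		return bit_count;
	}
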
@@ -888,8 +907,7 @@ lzms_consider_lz_explicit_offset_matches(const struct lzms_compressor *c,
        len = 2;
        i = 0;
        do {
-               position_cost = base_cost + lzms_lz_offset_cost(c,
-                                                               matches[i].offset);
+               position_cost = base_cost + lzms_lz_offset_cost(c, matches[i].offset);
                do {
                        cost = position_cost + lzms_fast_length_cost(c, len);
                        if (cost < (cur_optimum_ptr + len)->cost) {
@@ -907,7 +925,7 @@ lzms_init_adaptive_state(struct lzms_adaptive_state *state)
 {
        unsigned i;
 
-       lzms_init_lz_lru_queues(&state->lru);
+       lzms_init_lz_lru_queue(&state->lru);
        state->main_state = 0;
        state->match_state = 0;
        state->lz_match_state = 0;
@@ -1218,8 +1236,8 @@ begin:
                 *      the parser will not go too long without updating the
                 *      probability tables.
                 *
-                * Note: no check for end-of-block is needed because
-                * end-of-block will trigger condition (1).
+                * Note: no check for end-of-window is needed because
+                * end-of-window will trigger condition (1).
                 */
                if (cur_optimum_ptr == end_optimum_ptr ||
                    cur_optimum_ptr == c->optimum_end)
@@ -1243,10 +1261,7 @@ lzms_init_range_encoder(struct lzms_range_encoder *enc,
        enc->state = 0;
        LZMS_ASSERT(is_power_of_2(num_states));
        enc->mask = num_states - 1;
-       for (u32 i = 0; i < num_states; i++) {
-               enc->prob_entries[i].num_recent_zero_bits = LZMS_INITIAL_PROBABILITY;
-               enc->prob_entries[i].recent_bits = LZMS_INITIAL_RECENT_BITS;
-       }
+       lzms_init_probability_entries(enc->prob_entries, num_states);
 }
 
 static void
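
The open-coded initialization loop is replaced by a call to the shared lzms_init_probability_entries() helper. Judging only from the removed lines, the helper is expected to do the equivalent of the following sketch (the entry struct name is assumed from context, and the real definition lives elsewhere and may differ):

	/* Sketch based on the removed loop above, not the helper's real body.  */
	static void
	example_init_probability_entries(struct lzms_probability_entry *entries,
					 size_t count)
	{
		for (size_t i = 0; i < count; i++) {
			entries[i].num_recent_zero_bits = LZMS_INITIAL_PROBABILITY;
			entries[i].recent_bits = LZMS_INITIAL_RECENT_BITS;
		}
	}
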
@@ -1300,7 +1315,7 @@ lzms_prepare_compressor(struct lzms_compressor *c, const u8 *udata, u32 ulen,
                                  LZMS_LZ_OFFSET_CODE_REBUILD_FREQ);
 
        lzms_init_huffman_encoder(&c->length_encoder, &c->os,
-                                 LZMS_NUM_LEN_SYMS,
+                                 LZMS_NUM_LENGTH_SYMS,
                                  LZMS_LENGTH_CODE_REBUILD_FREQ);
 
        lzms_init_huffman_encoder(&c->delta_offset_encoder, &c->os,
@@ -1376,26 +1391,26 @@ lzms_finalize(struct lzms_compressor *c, u8 *cdata, size_t csize_avail)
  * maximum window size.  */
 static void
 lzms_build_params(unsigned int compression_level,
-                 struct lzms_compressor_params *lzms_params)
+                 struct lzms_compressor_params *params)
 {
        /* Allow length 2 matches if the compression level is sufficiently high.
         */
        if (compression_level >= 45)
-               lzms_params->min_match_length = 2;
+               params->min_match_length = 2;
        else
-               lzms_params->min_match_length = 3;
+               params->min_match_length = 3;
 
        /* Scale nice_match_length and max_search_depth with the compression
         * level.  But to allow an optimization on length cost calculations,
        * don't allow nice_match_length to exceed LZMS_NUM_FAST_LENGTHS.  */
-       lzms_params->nice_match_length = ((u64)compression_level * 32) / 50;
-       if (lzms_params->nice_match_length < lzms_params->min_match_length)
-               lzms_params->nice_match_length = lzms_params->min_match_length;
-       if (lzms_params->nice_match_length > LZMS_NUM_FAST_LENGTHS)
-               lzms_params->nice_match_length = LZMS_NUM_FAST_LENGTHS;
-       lzms_params->max_search_depth = compression_level;
-
-       lzms_params->optim_array_length = 1024;
+       params->nice_match_length = ((u64)compression_level * 32) / 50;
+       if (params->nice_match_length < params->min_match_length)
+               params->nice_match_length = params->min_match_length;
+       if (params->nice_match_length > LZMS_NUM_FAST_LENGTHS)
+               params->nice_match_length = LZMS_NUM_FAST_LENGTHS;
+       params->max_search_depth = compression_level;
+
+       params->optim_array_length = 1024;
 }
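
Worked example of the scaling above: at compression_level = 50, min_match_length = 2 (since 50 >= 45), nice_match_length = (50 * 32) / 50 = 32, and max_search_depth = 50; at compression_level = 20, min_match_length = 3 and nice_match_length = (20 * 32) / 50 = 12. If the scaled value ever exceeds LZMS_NUM_FAST_LENGTHS it is clamped down, which is what enables the fast length-cost path mentioned in the comment.
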
 
 /* Given the internal compression parameters and maximum window size, build the
@@ -1498,8 +1513,6 @@ lzms_create_compressor(size_t max_block_size, unsigned int compression_level,
                goto oom;
        c->optimum_end = &c->optimum[params.optim_array_length];
 
-       lzms_init_slots();
-
        lzms_init_rc_costs();
 
        lzms_init_fast_slots(c);