wimlib.net Git - wimlib/blobdiff - src/xpress_compress.c
Stop force-inlining everything marked 'inline'
[wimlib] / src / xpress_compress.c
index bce0901f523665b8b95aa1ae73864d1e8ccfc600..1b430912de7521d5b2abb622f6b1a694f7ab6281 100644 (file)
@@ -212,7 +212,7 @@ struct xpress_output_bitstream {
        /* Pointer to the start of the output buffer.  */
        u8 *start;
 
-       /* Pointer to the location in the ouput buffer at which to write the
+       /* Pointer to the location in the output buffer at which to write the
         * next 16 bits.  */
        u8 *next_bits;
 
@@ -279,7 +279,7 @@ xpress_init_output(struct xpress_output_bitstream *os, void *buffer, size_t size
  * If the output buffer space is exhausted, then the bits will be ignored, and
  * xpress_flush_output() will return 0 when it gets called.
  */
-static inline void
+static forceinline void
 xpress_write_bits(struct xpress_output_bitstream *os,
                  const u32 bits, const unsigned num_bits)
 {
@@ -292,7 +292,7 @@ xpress_write_bits(struct xpress_output_bitstream *os,
        if (os->bitcount > 16) {
                os->bitcount -= 16;
                if (os->end - os->next_byte >= 2) {
-                       put_unaligned_u16_le(os->bitbuf >> os->bitcount, os->next_bits);
+                       put_unaligned_le16(os->bitbuf >> os->bitcount, os->next_bits);
                        os->next_bits = os->next_bits2;
                        os->next_bits2 = os->next_byte;
                        os->next_byte += 2;
@@ -303,7 +303,7 @@ xpress_write_bits(struct xpress_output_bitstream *os,
 /*
  * Interweave a literal byte into the output bitstream.
  */
-static inline void
+static forceinline void
 xpress_write_byte(struct xpress_output_bitstream *os, u8 byte)
 {
        if (os->next_byte < os->end)
@@ -313,11 +313,11 @@ xpress_write_byte(struct xpress_output_bitstream *os, u8 byte)
 /*
  * Interweave two literal bytes into the output bitstream.
  */
-static inline void
+static forceinline void
 xpress_write_u16(struct xpress_output_bitstream *os, u16 v)
 {
        if (os->end - os->next_byte >= 2) {
-               put_unaligned_u16_le(v, os->next_byte);
+               put_unaligned_le16(v, os->next_byte);
                os->next_byte += 2;
        }
 }
@@ -332,13 +332,13 @@ xpress_flush_output(struct xpress_output_bitstream *os)
        if (os->end - os->next_byte < 2)
                return 0;
 
-       put_unaligned_u16_le(os->bitbuf << (16 - os->bitcount), os->next_bits);
-       put_unaligned_u16_le(0, os->next_bits2);
+       put_unaligned_le16(os->bitbuf << (16 - os->bitcount), os->next_bits);
+       put_unaligned_le16(0, os->next_bits2);
 
        return os->next_byte - os->start;
 }
 
-static inline void
+static forceinline void
 xpress_write_extra_length_bytes(struct xpress_output_bitstream *os,
                                unsigned adjusted_len)
 {
@@ -353,7 +353,7 @@ xpress_write_extra_length_bytes(struct xpress_output_bitstream *os,
 }
 
 /* Output a match or literal.  */
-static inline void
+static forceinline void
 xpress_write_item(struct xpress_item item, struct xpress_output_bitstream *os,
                  const u32 codewords[], const u8 lens[])
 {
@@ -413,7 +413,7 @@ xpress_write_item_list(struct xpress_output_bitstream *os,
                        unsigned sym;
 
                        adjusted_len = length - XPRESS_MIN_MATCH_LEN;
-                       log2_offset = fls32(offset);
+                       log2_offset = bsr32(offset);
                        len_hdr = min(0xF, adjusted_len);
                        sym = XPRESS_NUM_CHARS + ((log2_offset << 4) | len_hdr);
 
@@ -484,7 +484,7 @@ xpress_write(struct xpress_compressor *c, void *out, size_t out_nbytes_avail,
 
 /* Tally the Huffman symbol for a literal and return the intermediate
  * representation of that literal.  */
-static inline struct xpress_item
+static forceinline struct xpress_item
 xpress_record_literal(struct xpress_compressor *c, unsigned literal)
 {
        c->freqs[literal]++;
@@ -496,12 +496,12 @@ xpress_record_literal(struct xpress_compressor *c, unsigned literal)
 
 /* Tally the Huffman symbol for a match and return the intermediate
  * representation of that match.  */
-static inline struct xpress_item
+static forceinline struct xpress_item
 xpress_record_match(struct xpress_compressor *c, unsigned length, unsigned offset)
 {
        unsigned adjusted_len = length - XPRESS_MIN_MATCH_LEN;
        unsigned len_hdr = min(adjusted_len, 0xF);
-       unsigned log2_offset = fls32(offset);
+       unsigned log2_offset = bsr32(offset);
        unsigned sym = XPRESS_NUM_CHARS + ((log2_offset << 4) | len_hdr);
 
        c->freqs[sym]++;
@@ -755,7 +755,7 @@ xpress_tally_item_list(struct xpress_compressor *c,
                        unsigned sym;
 
                        adjusted_len = length - XPRESS_MIN_MATCH_LEN;
-                       log2_offset = fls32(offset);
+                       log2_offset = bsr32(offset);
                        len_hdr = min(0xF, adjusted_len);
                        sym = XPRESS_NUM_CHARS + ((log2_offset << 4) | len_hdr);
 
@@ -831,7 +831,7 @@ xpress_find_min_cost_path(struct xpress_compressor *c, size_t in_nbytes,
                                u32 offset_cost;
 
                                offset = match->offset;
-                               log2_offset = fls32(offset);
+                               log2_offset = bsr32(offset);
                                offset_cost = log2_offset;
                                do {
                                        unsigned len_hdr;
@@ -860,7 +860,7 @@ xpress_find_min_cost_path(struct xpress_compressor *c, size_t in_nbytes,
                                u32 offset_cost;
 
                                offset = match->offset;
-                               log2_offset = fls32(offset);
+                               log2_offset = bsr32(offset);
                                offset_cost = log2_offset;
                                do {
                                        unsigned adjusted_len;
@@ -906,27 +906,23 @@ xpress_find_matches(struct xpress_compressor * restrict c,
 {
        const u8 * const in_begin = in;
        const u8 *in_next = in_begin;
-       const u8 * const in_end = in_begin + in_nbytes;
        struct lz_match *cache_ptr = c->match_cache;
-       u32 next_hash = 0;
+       u32 next_hashes[2] = {};
+       u32 max_len = in_nbytes;
+       u32 nice_len = min(max_len, c->nice_match_length);
 
        bt_matchfinder_init(&c->bt_mf);
 
-       do {
+       for (;;) {
                struct lz_match *matches;
-               unsigned best_len;
+               u32 best_len;
 
                /* If we've found so many matches that the cache might overflow
                 * if we keep finding more, then stop finding matches.  This
                 * case is very unlikely.  */
-               if (unlikely(cache_ptr >= c->cache_overflow_mark)) {
-                       do {
-                               cache_ptr->length = 0;
-                               cache_ptr->offset = *in_next++;
-                               cache_ptr++;
-                       } while (in_next != in_end);
-                       return cache_ptr;
-               }
+               if (unlikely(cache_ptr >= c->cache_overflow_mark ||
+                            max_len < BT_MATCHFINDER_REQUIRED_NBYTES))
+                       break;
 
                matches = cache_ptr;
 
@@ -937,16 +933,17 @@ xpress_find_matches(struct xpress_compressor * restrict c,
                        bt_matchfinder_get_matches(&c->bt_mf,
                                                   in_begin,
                                                   in_next - in_begin,
-                                                  in_end - in_next,
-                                                  min(in_end - in_next, c->nice_match_length),
+                                                  max_len,
+                                                  nice_len,
                                                   c->max_search_depth,
-                                                  &next_hash,
+                                                  next_hashes,
                                                   &best_len,
                                                   cache_ptr);
                cache_ptr->length = cache_ptr - matches;
-               cache_ptr->offset = *in_next;
-               in_next++;
+               cache_ptr->offset = *in_next++;
                cache_ptr++;
+               max_len--;
+               nice_len = min(nice_len, max_len);
 
                /*
                 * If there was a very long match found, then don't cache any
@@ -958,24 +955,32 @@ xpress_find_matches(struct xpress_compressor * restrict c,
                 * very much.  If there's a long match, then the data must be
                 * highly compressible, so it doesn't matter as much what we do.
                 */
-               if (best_len >= c->nice_match_length) {
+               if (best_len >= nice_len) {
+                       if (unlikely(best_len +
+                                    BT_MATCHFINDER_REQUIRED_NBYTES >= max_len))
+                               break;
                        --best_len;
                        do {
                                bt_matchfinder_skip_position(&c->bt_mf,
                                                             in_begin,
                                                             in_next - in_begin,
-                                                            in_end - in_next,
-                                                            min(in_end - in_next,
-                                                                c->nice_match_length),
+                                                            nice_len,
                                                             c->max_search_depth,
-                                                            &next_hash);
-
+                                                            next_hashes);
                                cache_ptr->length = 0;
                                cache_ptr->offset = *in_next++;
                                cache_ptr++;
+                               max_len--;
+                               nice_len = min(nice_len, max_len);
                        } while (--best_len);
                }
-       } while (in_next != in_end);
+       }
+
+       while (max_len--) {
+               cache_ptr->length = 0;
+               cache_ptr->offset = *in_next++;
+               cache_ptr++;
+       }
 
        return cache_ptr;
 }
@@ -1086,12 +1091,12 @@ xpress_create_compressor(size_t max_bufsize, unsigned compression_level,
 
                if (compression_level < 30) {
                        c->impl = xpress_compress_greedy;
-                       c->max_search_depth = (compression_level * 24) / 16;
-                       c->nice_match_length = (compression_level * 48) / 16;
+                       c->max_search_depth = (compression_level * 30) / 16;
+                       c->nice_match_length = (compression_level * 60) / 16;
                } else {
                        c->impl = xpress_compress_lazy;
-                       c->max_search_depth = (compression_level * 24) / 32;
-                       c->nice_match_length = (compression_level * 48) / 32;
+                       c->max_search_depth = (compression_level * 30) / 32;
+                       c->nice_match_length = (compression_level * 60) / 32;
 
                        /* xpress_compress_lazy() needs max_search_depth >= 2
                         * because it halves the max_search_depth when
@@ -1118,8 +1123,8 @@ xpress_create_compressor(size_t max_bufsize, unsigned compression_level,
                        &c->match_cache[max_bufsize * CACHE_RESERVE_PER_POS];
 
                c->impl = xpress_compress_near_optimal;
-               c->max_search_depth = (compression_level * 32) / 100;
-               c->nice_match_length = (compression_level * 50) / 100;
+               c->max_search_depth = (compression_level * 28) / 100;
+               c->nice_match_length = (compression_level * 56) / 100;
                c->num_optim_passes = compression_level / 40;
        }
 #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */