wimlib.net Git - wimlib/blobdiff - src/lzx_decompress.c
lzx_decompress: redeclare input_bitstream above main loop
[wimlib] / src / lzx_decompress.c
index 9f93fcf781441cc815e155dda41b6a339910c879..5747d327270717ae6892c257374088e4dfec789d 100644 (file)
@@ -93,6 +93,21 @@ struct lzx_decompressor {
                u8 extra_offset_bits[LZX_MAX_OFFSET_SLOTS];
        };
 
+       union {
+               DECODE_TABLE_WORKING_SPACE(maincode_working_space,
+                                          LZX_MAINCODE_MAX_NUM_SYMBOLS,
+                                          LZX_MAX_MAIN_CODEWORD_LEN);
+               DECODE_TABLE_WORKING_SPACE(lencode_working_space,
+                                          LZX_LENCODE_NUM_SYMBOLS,
+                                          LZX_MAX_LEN_CODEWORD_LEN);
+               DECODE_TABLE_WORKING_SPACE(alignedcode_working_space,
+                                          LZX_ALIGNEDCODE_NUM_SYMBOLS,
+                                          LZX_MAX_ALIGNED_CODEWORD_LEN);
+               DECODE_TABLE_WORKING_SPACE(precode_working_space,
+                                          LZX_PRECODE_NUM_SYMBOLS,
+                                          LZX_MAX_PRE_CODEWORD_LEN);
+       };
+
        unsigned window_order;
        unsigned num_main_syms;
 
@@ -103,7 +118,7 @@ struct lzx_decompressor {
 } _aligned_attribute(DECODE_TABLE_ALIGNMENT);
 
 /* Read a Huffman-encoded symbol using the precode. */
-static inline unsigned
+static forceinline unsigned
 read_presym(const struct lzx_decompressor *d, struct input_bitstream *is)
 {
        return read_huffsym(is, d->precode_decode_table,
@@ -111,7 +126,7 @@ read_presym(const struct lzx_decompressor *d, struct input_bitstream *is)
 }
 
 /* Read a Huffman-encoded symbol using the main code. */
-static inline unsigned
+static forceinline unsigned
 read_mainsym(const struct lzx_decompressor *d, struct input_bitstream *is)
 {
        return read_huffsym(is, d->maincode_decode_table,
@@ -119,7 +134,7 @@ read_mainsym(const struct lzx_decompressor *d, struct input_bitstream *is)
 }
 
 /* Read a Huffman-encoded symbol using the length code. */
-static inline unsigned
+static forceinline unsigned
 read_lensym(const struct lzx_decompressor *d, struct input_bitstream *is)
 {
        return read_huffsym(is, d->lencode_decode_table,
@@ -127,7 +142,7 @@ read_lensym(const struct lzx_decompressor *d, struct input_bitstream *is)
 }
 
 /* Read a Huffman-encoded symbol using the aligned offset code. */
-static inline unsigned
+static forceinline unsigned
 read_alignedsym(const struct lzx_decompressor *d, struct input_bitstream *is)
 {
        return read_huffsym(is, d->alignedcode_decode_table,
@@ -157,7 +172,8 @@ lzx_read_codeword_lens(struct lzx_decompressor *d, struct input_bitstream *is,
                                      LZX_PRECODE_NUM_SYMBOLS,
                                      LZX_PRECODE_TABLEBITS,
                                      d->precode_lens,
-                                     LZX_MAX_PRE_CODEWORD_LEN))
+                                     LZX_MAX_PRE_CODEWORD_LEN,
+                                     d->precode_working_space))
                return -1;
 
        /* Decode the codeword lengths.  */
@@ -316,10 +332,19 @@ lzx_read_block_header(struct lzx_decompressor *d, struct input_bitstream *is,
 
 /* Decompress a block of LZX-compressed data. */
 static int
-lzx_decompress_block(struct lzx_decompressor *d, struct input_bitstream *is,
+lzx_decompress_block(struct lzx_decompressor *d, struct input_bitstream *_is,
                     int block_type, u32 block_size,
                     u8 * const out_begin, u8 *out_next, u32 recent_offsets[])
 {
+       /*
+        * Redeclare the input bitstream on the stack.  This shouldn't be
+        * needed, but it can improve the main loop's performance significantly
+        * with both gcc and clang, apparently because the compiler otherwise
+        * gets confused and doesn't properly allocate registers for
+        * 'is->bitbuf' et al. and/or thinks 'is->next' may point into 'is'.
+        */
+       struct input_bitstream is_onstack = *_is;
+       struct input_bitstream *is = &is_onstack;
        u8 * const block_end = out_next + block_size;
        unsigned min_aligned_offset_slot;
 
@@ -333,14 +358,16 @@ lzx_decompress_block(struct lzx_decompressor *d, struct input_bitstream *is,
                                      d->num_main_syms,
                                      LZX_MAINCODE_TABLEBITS,
                                      d->maincode_lens,
-                                     LZX_MAX_MAIN_CODEWORD_LEN))
+                                     LZX_MAX_MAIN_CODEWORD_LEN,
+                                     d->maincode_working_space))
                return -1;
 
        if (make_huffman_decode_table(d->lencode_decode_table,
                                      LZX_LENCODE_NUM_SYMBOLS,
                                      LZX_LENCODE_TABLEBITS,
                                      d->lencode_lens,
-                                     LZX_MAX_LEN_CODEWORD_LEN))
+                                     LZX_MAX_LEN_CODEWORD_LEN,
+                                     d->lencode_working_space))
                return -1;
 
        if (block_type == LZX_BLOCKTYPE_ALIGNED) {
@@ -348,7 +375,8 @@ lzx_decompress_block(struct lzx_decompressor *d, struct input_bitstream *is,
                                              LZX_ALIGNEDCODE_NUM_SYMBOLS,
                                              LZX_ALIGNEDCODE_TABLEBITS,
                                              d->alignedcode_lens,
-                                             LZX_MAX_ALIGNED_CODEWORD_LEN))
+                                             LZX_MAX_ALIGNED_CODEWORD_LEN,
+                                             d->alignedcode_working_space))
                        return -1;
                min_aligned_offset_slot = LZX_MIN_ALIGNED_OFFSET_SLOT;
                memcpy(d->extra_offset_bits, d->extra_offset_bits_minus_aligned,
@@ -409,20 +437,14 @@ lzx_decompress_block(struct lzx_decompressor *d, struct input_bitstream *is,
                }
                recent_offsets[0] = offset;
 
-               /* Validate the match, then copy it to the current position.  */
-
-               if (unlikely(length > block_end - out_next))
+               /* Validate the match and copy it to the current position.  */
+               if (unlikely(lz_copy(length, offset, out_begin,
+                                    out_next, block_end, LZX_MIN_MATCH_LEN)))
                        return -1;
-
-               if (unlikely(offset > out_next - out_begin))
-                       return -1;
-
-               lz_copy(out_next, length, offset, block_end, LZX_MIN_MATCH_LEN);
-
                out_next += length;
-
        } while (out_next != block_end);
 
+       *_is = is_onstack;
        return 0;
 }