From 347be02d2a643d500ff64dbec883f8367c48b956 Mon Sep 17 00:00:00 2001
From: Eric Biggers
Date: Sun, 24 Jul 2016 15:46:04 -0700
Subject: [PATCH] dec64table_dec32table_lzcopy

---
 include/wimlib/decompress_common.h | 87 ++++++++++++------------------
 1 file changed, 34 insertions(+), 53 deletions(-)

diff --git a/include/wimlib/decompress_common.h b/include/wimlib/decompress_common.h
index a67d7986..dc1bfa85 100644
--- a/include/wimlib/decompress_common.h
+++ b/include/wimlib/decompress_common.h
@@ -437,6 +437,9 @@ repeat_byte(u8 b)
 	return repeat_u16(((u16)b << 8) | b);
 }
 
+static const unsigned dec32table[] = {4, 1, 2, 1, 4, 4, 4, 4};
+static const int dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3};
+
 /*
  * Copy an LZ77 match of 'length' bytes from the match source at 'out_next -
  * offset' to the match destination at 'out_next'. The source and destination
@@ -484,61 +487,39 @@ lz_copy(u32 length, u32 offset, u8 *out_begin, u8 *out_next, u8 *out_end,
 		return -1;
 	end = out_next + length;
 
-	/*
-	 * Try to copy one word at a time. On i386 and x86_64 this is faster
-	 * than copying one byte at a time, unless the data is near-random and
-	 * all the matches have very short lengths. Note that since this
-	 * requires unaligned memory accesses, it won't necessarily be faster on
-	 * every architecture.
-	 *
-	 * Also note that we might copy more than the length of the match. For
-	 * example, if a word is 8 bytes and the match is of length 5, then
-	 * we'll simply copy 8 bytes. This is okay as long as we don't write
-	 * beyond the end of the output buffer, hence the check for (out_end -
-	 * end >= WORDBYTES - 1).
-	 */
-	if (UNALIGNED_ACCESS_IS_FAST && likely(out_end - end >= WORDBYTES - 1))
-	{
-		if (offset >= WORDBYTES) {
-			/* The source and destination words don't overlap. */
-			do {
-				copy_word_unaligned(src, out_next);
-				src += WORDBYTES;
-				out_next += WORDBYTES;
-			} while (out_next < end);
-			return 0;
-		} else if (offset == 1) {
-			/* Offset 1 matches are equivalent to run-length
-			 * encoding of the previous byte. This case is common
-			 * if the data contains many repeated bytes. */
-			machine_word_t v = repeat_byte(*src);
-			do {
-				store_word_unaligned(v, out_next);
-				out_next += WORDBYTES;
-			} while (out_next < end);
-			return 0;
-		}
-		/*
-		 * We don't bother with special cases for other 'offset <
-		 * WORDBYTES', which are usually rarer than 'offset == 1'.
-		 * Extra checks will just slow things down. Actually, it's
-		 * possible to handle all the 'offset < WORDBYTES' cases using
-		 * the same code, but it still becomes more complicated doesn't
-		 * seem any faster overall; it definitely slows down the more
-		 * common 'offset == 1' case.
-		 */
+	if (!UNALIGNED_ACCESS_IS_FAST || unlikely(out_end - end < WORDBYTES - 1)) {
+		if (min_length >= 2)
+			*out_next++ = *src++;
+		if (min_length >= 3)
+			*out_next++ = *src++;
+		if (min_length >= 4)
+			*out_next++ = *src++;
+		do {
+			*out_next++ = *src++;
+		} while (out_next != end);
+		return 0;
+	}
+
+	if (unlikely(offset < 8)) {
+		const int dec64 = dec64table[offset];
+		out_next[0] = src[0];
+		out_next[1] = src[1];
+		out_next[2] = src[2];
+		out_next[3] = src[3];
+		src += dec32table[offset];
+		store_u32_unaligned(load_u32_unaligned(src), out_next + 4);
+		src -= dec64;
+	} else {
+		copy_word_unaligned(src, out_next);
+		src += 8;
 	}
+	out_next += 8;
 
-	/* Fall back to a bytewise copy. */
-	if (min_length >= 2)
-		*out_next++ = *src++;
-	if (min_length >= 3)
-		*out_next++ = *src++;
-	if (min_length >= 4)
-		*out_next++ = *src++;
-	do {
-		*out_next++ = *src++;
-	} while (out_next != end);
+	while (out_next < end) {
+		copy_word_unaligned(src, out_next);
+		src += WORDBYTES;
+		out_next += WORDBYTES;
+	}
 	return 0;
 }
 
-- 
2.46.1
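
For reference, here is a standalone sketch of the dec32table/dec64table overlap-copy technique used in the second hunk. It is an illustration, not wimlib code: plain memcpy() stands in for copy_word_unaligned()/load_u32_unaligned()/store_u32_unaligned(), the word size is assumed to be 8 bytes (WORDBYTES), and the overlap_copy() helper plus its test driver are hypothetical names invented for the example.

/* Standalone illustration (not wimlib code) of the dec32table/dec64table
 * overlap-copy trick from the patch above. memcpy() replaces wimlib's
 * unaligned load/store helpers; the word size is assumed to be 8 bytes. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static const unsigned dec32table[] = {4, 1, 2, 1, 4, 4, 4, 4};
static const int dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3};

/* Copy 'length' bytes from 'out - offset' to 'out'. Assumes length >= 8 and
 * at least 7 bytes of slack after 'out + length', since whole words may be
 * written past the end of the match. */
static void
overlap_copy(uint8_t *out, uint32_t offset, uint32_t length)
{
	const uint8_t *src = out - offset;
	uint8_t *const end = out + length;

	if (offset < 8) {
		const int dec64 = dec64table[offset];

		/* The first 4 bytes may overlap the destination, so copy
		 * them one byte at a time. */
		out[0] = src[0];
		out[1] = src[1];
		out[2] = src[2];
		out[3] = src[3];

		/* Slide the source so the next 4 bytes can be copied as one
		 * 32-bit access that reads only already-written output. */
		src += dec32table[offset];
		memcpy(out + 4, src, 4);

		/* Rewind so that later 8-byte copies read at least 8 bytes
		 * behind the write position, while the source-to-destination
		 * distance stays a multiple of the offset. */
		src -= dec64;
	} else {
		/* offset >= 8: no overlap within a single 8-byte copy. */
		memcpy(out, src, 8);
		src += 8;
	}
	out += 8;

	/* Finish with plain 8-byte word copies. */
	while (out < end) {
		memcpy(out, src, 8);
		src += 8;
		out += 8;
	}
}

int
main(void)
{
	/* Seed "abc", then expand an offset-3 match of 20 bytes: the output
	 * becomes the repeating pattern "abcabcabc...". */
	uint8_t buf[64] = "abc";

	overlap_copy(buf + 3, 3, 20);
	buf[23] = '\0';
	printf("%s\n", (char *)buf);
	return 0;
}

Running this prints the repeating "abc" pattern, which is what an offset-3, length-20 LZ77 match should expand to; the same table logic lets the patched lz_copy() handle all offsets below the word size without branching on each one.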