From 0ecb0529b5fcacc1abafa1f3f02a40c44783ada8 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 9 Dec 2014 19:40:17 -0600 Subject: [PATCH] portability and compression cleanups - Split compiler.h into compiler.h and compiler-gcc.h - Compile with -std=c99. Some GCC extensions are still used but they are explicitly defined in compiler-gcc.h. - Add get_unaligned_uXX_le() and put_unaligned_uXX_le() helpers - Faster lz_copy() in some cases - Faster lz_extend() in some cases - New bitops.h header --- Makefile.am | 4 +- include/wimlib/bitops.h | 92 ++++++++++++++++++++++++ include/wimlib/compiler-gcc.h | 69 ++++++++++++++++++ include/wimlib/compiler.h | 106 ++++++++++++++++++---------- include/wimlib/decompress_common.h | 108 +++++++++++++++++++---------- include/wimlib/endianness.h | 108 ++++++++++++++++------------- include/wimlib/lz_extend.h | 75 +++++++++----------- include/wimlib/lz_hash3.h | 47 ++++++++----- include/wimlib/lz_repsearch.h | 9 +-- include/wimlib/lzx.h | 3 +- include/wimlib/types.h | 11 ++- include/wimlib/unaligned.h | 95 ++++++++++++++++++++++++- include/wimlib/util.h | 34 +-------- src/decompress_common.c | 57 ++++++++------- src/lz_hash_chains.c | 88 ++++++++++++----------- src/lzms-common.c | 15 ++-- src/lzms-compress.c | 13 ++-- src/lzms-decompress.c | 11 +-- src/lzx-common.c | 59 ++++++++-------- src/lzx-compress.c | 6 +- src/lzx-decompress.c | 3 +- src/resource.c | 7 +- src/util.c | 7 +- src/wim.c | 3 +- src/write.c | 4 +- src/xpress-compress.c | 30 ++++---- src/xpress-decompress.c | 11 +-- 27 files changed, 703 insertions(+), 372 deletions(-) create mode 100644 include/wimlib/bitops.h create mode 100644 include/wimlib/compiler-gcc.h diff --git a/Makefile.am b/Makefile.am index e49c5be6..a9846de6 100644 --- a/Makefile.am +++ b/Makefile.am @@ -3,7 +3,7 @@ ACLOCAL_AMFLAGS = -I m4 AM_CPPFLAGS = -I$(top_srcdir)/include $(WINDOWS_CPPFLAGS) \ -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -AM_CFLAGS = -std=gnu99 -Wmissing-prototypes -Wstrict-prototypes \ +AM_CFLAGS = -std=c99 -Wmissing-prototypes -Wstrict-prototypes \ -Werror-implicit-function-declaration \ -fno-common -Wundef -Wno-pointer-sign @@ -82,10 +82,12 @@ libwim_la_SOURCES = \ include/wimlib/apply.h \ include/wimlib/assert.h \ include/wimlib/avl_tree.h \ + include/wimlib/bitops.h \ include/wimlib/callback.h \ include/wimlib/capture.h \ include/wimlib/case.h \ include/wimlib/compiler.h \ + include/wimlib/compiler-gcc.h \ include/wimlib/compressor_ops.h \ include/wimlib/compress_common.h \ include/wimlib/chunk_compressor.h \ diff --git a/include/wimlib/bitops.h b/include/wimlib/bitops.h new file mode 100644 index 00000000..2e7a948f --- /dev/null +++ b/include/wimlib/bitops.h @@ -0,0 +1,92 @@ +/* + * bitops.h + * + * Inline functions for bit manipulation. + * + * The author dedicates this file to the public domain. + * You can do whatever you want with this file. 
+ */ + +#ifndef _WIMLIB_BITOPS_H +#define _WIMLIB_BITOPS_H + +#include "wimlib/compiler.h" +#include "wimlib/types.h" + +/* Find Last Set bit */ + +static inline unsigned +fls32(u32 v) +{ +#ifdef compiler_fls32 + return compiler_fls32(v); +#else + unsigned bit = 0; + while ((v >>= 1) != 0) + bit++; + return bit; +#endif +} + +static inline unsigned +fls64(u64 v) +{ +#ifdef compiler_fls64 + return compiler_fls64(v); +#else + unsigned bit = 0; + while ((v >>= 1) != 0) + bit++; + return bit; +#endif +} + +static inline unsigned +flsw(machine_word_t v) +{ + BUILD_BUG_ON(WORDSIZE != 4 && WORDSIZE != 8); + if (WORDSIZE == 4) + return fls32(v); + else + return fls64(v); +} + +/* Find First Set bit */ + +static inline unsigned +ffs32(u32 v) +{ +#ifdef compiler_ffs32 + return compiler_ffs32(v); +#else + unsigned bit; + for (bit = 0; !(v & 1); bit++, v >>= 1) + ; + return bit; +#endif +} + +static inline unsigned +ffs64(u64 v) +{ +#ifdef compiler_ffs64 + return compiler_ffs64(v); +#else + unsigned bit; + for (bit = 0; !(v & 1); bit++, v >>= 1) + ; + return bit; +#endif +} + +static inline unsigned +ffsw(machine_word_t v) +{ + BUILD_BUG_ON(WORDSIZE != 4 && WORDSIZE != 8); + if (WORDSIZE == 4) + return ffs32(v); + else + return ffs64(v); +} + +#endif /* _WIMLIB_BITOPS_H */ diff --git a/include/wimlib/compiler-gcc.h b/include/wimlib/compiler-gcc.h new file mode 100644 index 00000000..db6e3249 --- /dev/null +++ b/include/wimlib/compiler-gcc.h @@ -0,0 +1,69 @@ +/* + * compiler-gcc.h + * + * The author dedicates this file to the public domain. + * You can do whatever you want with this file. + */ + +#ifndef _WIMLIB_COMPILER_GCC_H +#define _WIMLIB_COMPILER_GCC_H + +#ifdef __WIN32__ +# define WIMLIBAPI __declspec(dllexport) +#else +# define WIMLIBAPI __attribute__((visibility("default"))) +#endif + +#define _packed_attribute __attribute__((packed)) +#define _aligned_attribute(n) __attribute__((aligned(n))) +#define _may_alias_attribute __attribute__((may_alias)) +#define likely(expr) __builtin_expect(!!(expr), 1) +#define unlikely(expr) __builtin_expect(!!(expr), 0) +#define prefetch(addr) __builtin_prefetch(addr) +#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4) +# define _cold_attribute __attribute__((cold)) +#endif +#define _malloc_attribute __attribute__((malloc)) +#define inline inline __attribute__((always_inline)) + +#define CPU_IS_BIG_ENDIAN (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + +#if defined(__x86_64__) || defined(__i386__) +# define UNALIGNED_ACCESS_SPEED 3 +#elif defined(__ARM_FEATURE_UNALIGNED) && (__ARM_FEATURE_UNALIGNED == 1) +# define UNALIGNED_ACCESS_SPEED 2 +#else +# define UNALIGNED_ACCESS_SPEED 0 +#endif + +#define typeof __typeof__ + +#ifndef min +# define min(a, b) ({ typeof(a) _a = (a); typeof(b) _b = (b); \ + (_a < _b) ? _a : _b; }) +#endif + +#ifndef max +# define max(a, b) ({ typeof(a) _a = (a); typeof(b) _b = (b); \ + (_a > _b) ? 
_a : _b; }) +#endif + +#ifndef swap +# define swap(a, b) ({ typeof(a) _a = (a); (a) = (b); (b) = _a; }) +#endif + +#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) +# define compiler_bswap32 __builtin_bswap32 +# define compiler_bswap64 __builtin_bswap64 +#endif + +#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) +# define compiler_bswap16 __builtin_bswap16 +#endif + +#define compiler_fls32(n) (31 - __builtin_clz(n)) +#define compiler_fls64(n) (63 - __builtin_clzll(n)) +#define compiler_ffs32(n) __builtin_ctz(n) +#define compiler_ffs64(n) __builtin_ctzll(n) + +#endif /* _WIMLIB_COMPILER_GCC_H */ diff --git a/include/wimlib/compiler.h b/include/wimlib/compiler.h index 221df688..a1f513f5 100644 --- a/include/wimlib/compiler.h +++ b/include/wimlib/compiler.h @@ -9,44 +9,72 @@ #define _WIMLIB_COMPILER_H #ifdef __GNUC__ -# if defined(__CYGWIN__) || defined(__WIN32__) -# define WIMLIBAPI __declspec(dllexport) -# else -# define WIMLIBAPI __attribute__((visibility("default"))) -# endif -# define _always_inline_attribute inline __attribute__((always_inline)) -# define _no_inline_attribute __attribute__((noinline)) -# define _packed_attribute __attribute__((packed)) -# define _format_attribute(type, format_str, args_start) \ - /*__attribute__((format(type, format_str, args_start))) */ -# if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4) -# define _cold_attribute __attribute__((cold)) -# else -# define _cold_attribute -# endif -# define _malloc_attribute __attribute__((malloc)) -# define _warn_unused_result_attribute __attribute__((warn_unused_result)) -# define _aligned_attribute(size) __attribute__((aligned(size))) -# define likely(x) __builtin_expect(!!(x), 1) -# define unlikely(x) __builtin_expect(!!(x), 0) -# define inline inline __attribute__((always_inline)) -# define prefetch(x) __builtin_prefetch(x) -# define is_constant(x) __builtin_constant_p(x) +# include "wimlib/compiler-gcc.h" #else -# define WIMLIBAPI -# define _always_inline_attribute inline -# define _no_inline_attribute -# define _format_attribute(type, format_str, args_start) -# define _cold_attribute -# define _packed_attribute -# define _malloc_attribute -# define _warn_unused_result_attribute -# define _aligned_attribute(size) -# define likely(x) (x) -# define unlikely(x) (x) -# define prefetch(x) -# define is_constant(x) (0) -#endif /* __GNUC__ */ +# error "Unrecognized compiler. Please add a header file for your compiler." 
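+/* Such a header (for example, a hypothetical compiler-msvc.h) would need to
+ * define at least _packed_attribute, _aligned_attribute, _may_alias_attribute,
+ * CPU_IS_BIG_ENDIAN, typeof, and min()/max()/swap(); everything else checked
+ * for below has a fallback definition. */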
+#endif + +#ifndef WIMLIBAPI +# define WIMLIBAPI +#endif + +#ifndef _packed_attribute +# error "missing required definition of _packed_attribute" +#endif + +#ifndef _aligned_attribute +# error "missing required definition of _aligned_attribute" +#endif + +#ifndef _may_alias_attribute +# error "missing required definition of _may_alias_attribute" +#endif + +#ifndef likely +# define likely(expr) (expr) +#endif + +#ifndef unlikely +# define unlikely(expr) (expr) +#endif + +#ifndef prefetch +# define prefetch(addr) +#endif + +#ifndef _cold_attribute +# define _cold_attribute +#endif + +#ifndef _malloc_attribute +# define _malloc_attribute +#endif + +#ifndef _format_attribute +# define _format_attribute(type, format_str, format_start) +#endif + +#ifndef CPU_IS_BIG_ENDIAN +# error "missing required definition of CPU_IS_BIG_ENDIAN" +#endif + +#define CPU_IS_LITTLE_ENDIAN (!CPU_IS_BIG_ENDIAN) + +#ifndef UNALIGNED_ACCESS_SPEED +# define UNALIGNED_ACCESS_SPEED 0 +#endif + +#define UNALIGNED_ACCESS_IS_ALLOWED (UNALIGNED_ACCESS_SPEED >= 1) +#define UNALIGNED_ACCESS_IS_FAST (UNALIGNED_ACCESS_SPEED >= 2) +#define UNALIGNED_ACCESS_IS_VERY_FAST (UNALIGNED_ACCESS_SPEED >= 3) + +#ifndef typeof +# error "missing required definition of typeof" +#endif + +#if !defined(min) || !defined(max) || !defined(swap) +# error "missing required definitions of min(), max(), and swap() macros" +#endif #ifdef __CHECKER__ # define _bitwise_attr __attribute__((bitwise)) @@ -56,4 +84,8 @@ # define _force_attr #endif +#ifndef BUILD_BUG_ON +# define BUILD_BUG_ON(expr) ((void)sizeof(char[1 - 2*!!(expr)])) +#endif + #endif /* _WIMLIB_COMPILER_H */ diff --git a/include/wimlib/decompress_common.h b/include/wimlib/decompress_common.h index 867e84a0..6762043c 100644 --- a/include/wimlib/decompress_common.h +++ b/include/wimlib/decompress_common.h @@ -14,6 +14,7 @@ #include "wimlib/compiler.h" #include "wimlib/endianness.h" #include "wimlib/types.h" +#include "wimlib/unaligned.h" /* Structure that encapsulates a block of in-memory data being interpreted as a * stream of bits, optionally with interwoven literal bytes. Bits are assumed @@ -67,8 +68,7 @@ bitstream_ensure_bits(struct input_bitstream *is, const unsigned num_bits) if (unlikely(is->end - is->next < 2)) goto overflow; - is->bitbuf |= (u32)le16_to_cpu(*(const le16 *)is->next) - << (16 - is->bitsleft); + is->bitbuf |= (u32)get_unaligned_u16_le(is->next) << (16 - is->bitsleft); is->next += 2; is->bitsleft += 16; @@ -76,7 +76,7 @@ bitstream_ensure_bits(struct input_bitstream *is, const unsigned num_bits) if (unlikely(is->end - is->next < 2)) goto overflow; - is->bitbuf |= (u32)le16_to_cpu(*(const le16 *)is->next); + is->bitbuf |= (u32)get_unaligned_u16_le(is->next); is->next += 2; is->bitsleft = 32; } @@ -144,7 +144,7 @@ bitstream_read_u16(struct input_bitstream *is) if (unlikely(is->end - is->next < 2)) return 0; - v = le16_to_cpu(*(const le16 *)is->next); + v = get_unaligned_u16_le(is->next); is->next += 2; return v; } @@ -157,7 +157,7 @@ bitstream_read_u32(struct input_bitstream *is) if (unlikely(is->end - is->next < 4)) return 0; - v = le32_to_cpu(*(const le32 *)is->next); + v = get_unaligned_u32_le(is->next); is->next += 4; return v; } @@ -256,7 +256,7 @@ make_huffman_decode_table(u16 decode_table[], unsigned num_syms, /* - * Copy a LZ77 match at (dst - offset) to dst. + * Copy an LZ77 match at (dst - offset) to dst. 
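+ *
+ * (@min_length is the minimum possible value of 'length'.  At the call sites
+ * in this patch it is a compile-time constant, which lets the bytewise
+ * fallback at the end of the function copy the first few bytes
+ * unconditionally.)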
 *
 * The length and offset must be already validated --- that is, (dst - offset)
 * can't underrun the output buffer, and (dst + length) can't overrun the output
@@ -266,50 +266,88 @@ make_huffman_decode_table(u16 decode_table[], unsigned num_syms,
 *
 * This function won't write any data beyond this position.
 */
 static inline void
-lz_copy(u8 *dst, u32 length, u32 offset, const u8 *winend)
+lz_copy(u8 *dst, u32 length, u32 offset, const u8 *winend, u32 min_length)
 {
        const u8 *src = dst - offset;
-#if defined(__x86_64__) || defined(__i386__)
-       /* Copy one 'unsigned long' at a time.  On i386 and x86_64 this is
+       const u8 * const end = dst + length;
+
+       /*
+        * Try to copy one machine word at a time.  On i386 and x86_64 this is
        * faster than copying one byte at a time, unless the data is
        * near-random and all the matches have very short lengths.  Note that
        * since this requires unaligned memory accesses, it won't necessarily
        * be faster on every architecture.
        *
        * Also note that we might copy more than the length of the match.  For
-        * example, if an 'unsigned long' is 8 bytes and the match is of length
-        * 5, then we'll simply copy 8 bytes.  This is okay as long as we don't
-        * write beyond the end of the output buffer, hence the check for
-        * (winend - (dst + length) >= sizeof(unsigned long) - 1). */
-       if (offset >= sizeof(unsigned long) &&
-           winend - (dst + length) >= sizeof(unsigned long) - 1)
+        * example, if a word is 8 bytes and the match is of length 5, then
+        * we'll simply copy 8 bytes.  This is okay as long as we don't write
+        * beyond the end of the output buffer, hence the check for (winend -
+        * end >= WORDSIZE - 1).
+        */
+       if (UNALIGNED_ACCESS_IS_VERY_FAST &&
+           likely(winend - end >= WORDSIZE - 1))
        {
-               /* Access memory through a packed struct.  This tricks the
-                * compiler into allowing unaligned memory accesses. */
-               struct ulong_wrapper {
-                       unsigned long v;
-               } _packed_attribute;
-
-               const u8 * const end = dst + length;
-               unsigned long v;
-
-               v = ((struct ulong_wrapper *)src)->v;
-               ((struct ulong_wrapper *)dst)->v = v;
-               dst += sizeof(unsigned long);
-               src += sizeof(unsigned long);
-               if (dst < end) {
+               if (offset >= WORDSIZE) {
+                       /* The source and destination words don't overlap. */
+
+                       /* To improve branch prediction, one iteration of this
+                        * loop is unrolled.  Most matches are short and will
+                        * fail the first check.  But if that check passes, then
+                        * it becomes increasingly likely that the match is long
+                        * and we'll need to continue copying. */
+
+                       copy_word_unaligned(src, dst);
+                       src += WORDSIZE;
+                       dst += WORDSIZE;
+
+                       if (dst < end) {
+                               do {
+                                       copy_word_unaligned(src, dst);
+                                       src += WORDSIZE;
+                                       dst += WORDSIZE;
+                               } while (dst < end);
+                       }
+                       return;
+               } else if (offset == 1) {
+
+                       /* Offset 1 matches are equivalent to run-length
+                        * encoding of the previous byte.  This case is common
+                        * if the data contains many repeated bytes. */
+
+                       machine_word_t v = repeat_byte(*(dst - 1));
                        do {
-                               v = ((struct ulong_wrapper *)src)->v;
-                               ((struct ulong_wrapper *)dst)->v = v;
-                               dst += sizeof(unsigned long);
-                               src += sizeof(unsigned long);
+                               store_word_unaligned(v, dst);
+                               src += WORDSIZE;
+                               dst += WORDSIZE;
                        } while (dst < end);
+                       return;
                }
+               /*
+                * We don't bother with special cases for other 'offset <
+                * WORDSIZE', which are usually rarer than 'offset == 1'.  Extra
+                * checks will just slow things down.
+                * Actually, it's possible to handle all the 'offset <
+                * WORDSIZE' cases using the same code, but it becomes more
+                * complicated and doesn't seem any faster overall; it
+                * definitely slows down the more common 'offset == 1' case.
+                */
+       }
 
-       return;
+       /* Fall back to a bytewise copy. */
+
+       if (min_length >= 2) {
+               *dst++ = *src++;
+               length--;
+       }
+       if (min_length >= 3) {
+               *dst++ = *src++;
+               length--;
+       }
+       if (min_length >= 4) {
+               *dst++ = *src++;
+               length--;
+       }
-#endif
        do {
                *dst++ = *src++;
        } while (--length);
diff --git a/include/wimlib/endianness.h b/include/wimlib/endianness.h
index 4c1b239f..d4de3350 100644
--- a/include/wimlib/endianness.h
+++ b/include/wimlib/endianness.h
@@ -1,71 +1,85 @@
+/*
+ * endianness.h
+ *
+ * Macros and inline functions for endianness conversion.
+ *
+ * The author dedicates this file to the public domain.
+ * You can do whatever you want with this file.
+ */
+
 #ifndef _WIMLIB_ENDIANNESS_H
 #define _WIMLIB_ENDIANNESS_H
 
+#include "wimlib/compiler.h"
 #include "wimlib/types.h"
 
-/* Watch out for conflict with ntfs-3g/endian.h ... */
-
+/* Watch out for conflict with ntfs-3g/endians.h ... */
 #ifndef _NTFS_ENDIANS_H
 
-static inline u16
-bswap16(u16 n)
+static inline u16 bswap16(u16 n)
 {
+#ifdef compiler_bswap16
+       return compiler_bswap16(n);
+#else
        return (n << 8) | (n >> 8);
+#endif
 }
 
-static inline u32
-bswap32(u32 n)
+static inline u32 bswap32(u32 n)
 {
-#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)
-       return __builtin_bswap32(n);
+#ifdef compiler_bswap32
+       return compiler_bswap32(n);
 #else
-       return (n << 24) | ((n & 0xff00) << 8) | ((n & 0xff0000) >> 8) |
-              (n >> 24);
+       return (n << 24) |
+              ((n & 0xFF00) << 8) |
+              ((n & 0xFF0000) >> 8) |
+              (n >> 24);
 #endif
 }
 
-static inline u64
-bswap64(u64 n)
+static inline u64 bswap64(u64 n)
 {
-#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)
-       return __builtin_bswap64(n);
+#ifdef compiler_bswap64
+       return compiler_bswap64(n);
 #else
-       return (n << 56) | ((n & 0xff00) << 40) | ((n & 0xff0000) << 24) |
-              ((n & 0xff000000) << 8) | ((n & 0xff00000000) >> 8) |
-              ((n & 0xff0000000000) >> 24) |
-              ((n & 0xff000000000000) >> 40) | (n >> 56);
+       return (n << 56) |
+              ((n & 0xFF00) << 40) |
+              ((n & 0xFF0000) << 24) |
+              ((n & 0xFF000000) << 8) |
+              ((n & 0xFF00000000) >> 8) |
+              ((n & 0xFF0000000000) >> 24) |
+              ((n & 0xFF000000000000) >> 40) |
+              (n >> 56);
 #endif
 }
 
-# ifdef WORDS_BIGENDIAN
-# define cpu_to_le16(n) ((_force_attr le16)bswap16(n))
-# define cpu_to_le32(n) ((_force_attr le32)bswap32(n))
-# define cpu_to_le64(n) ((_force_attr le64)bswap64(n))
-# define le16_to_cpu(n) bswap16((_force_attr u16)(le16)(n))
-# define le32_to_cpu(n) bswap32((_force_attr u32)(le32)(n))
-# define le64_to_cpu(n) bswap64((_force_attr u64)(le64)(n))
-
-# define cpu_to_be16(n) ((_force_attr be16)(u16)(n))
-# define cpu_to_be32(n) ((_force_attr be32)(u32)(n))
-# define cpu_to_be64(n) ((_force_attr be64)(u64)(n))
-# define be16_to_cpu(n) ((_force_attr u16)(be16)(n))
-# define be32_to_cpu(n) ((_force_attr u32)(be32)(n))
-# define be64_to_cpu(n) ((_force_attr u64)(be64)(n))
-# else
-# define cpu_to_le16(n) ((_force_attr le16)(u16)(n))
-# define cpu_to_le32(n) ((_force_attr le32)(u32)(n))
-# define cpu_to_le64(n) ((_force_attr le64)(u64)(n))
-# define le16_to_cpu(n) ((_force_attr u16)(le16)(n))
-# define le32_to_cpu(n) ((_force_attr u32)(le32)(n))
-# define le64_to_cpu(n) ((_force_attr u64)(le64)(n))
+#if CPU_IS_BIG_ENDIAN
+# define cpu_to_le16(n) ((_force_attr le16)bswap16(n))
+# define cpu_to_le32(n) ((_force_attr le32)bswap32(n))
+# 
define cpu_to_le64(n) ((_force_attr le64)bswap64(n)) +# define le16_to_cpu(n) bswap16((_force_attr u16)(le16)(n)) +# define le32_to_cpu(n) bswap32((_force_attr u32)(le32)(n)) +# define le64_to_cpu(n) bswap64((_force_attr u64)(le64)(n)) +# define cpu_to_be16(n) ((_force_attr be16)(u16)(n)) +# define cpu_to_be32(n) ((_force_attr be32)(u32)(n)) +# define cpu_to_be64(n) ((_force_attr be64)(u64)(n)) +# define be16_to_cpu(n) ((_force_attr u16)(be16)(n)) +# define be32_to_cpu(n) ((_force_attr u32)(be32)(n)) +# define be64_to_cpu(n) ((_force_attr u64)(be64)(n)) +#else +# define cpu_to_le16(n) ((_force_attr le16)(u16)(n)) +# define cpu_to_le32(n) ((_force_attr le32)(u32)(n)) +# define cpu_to_le64(n) ((_force_attr le64)(u64)(n)) +# define le16_to_cpu(n) ((_force_attr u16)(le16)(n)) +# define le32_to_cpu(n) ((_force_attr u32)(le32)(n)) +# define le64_to_cpu(n) ((_force_attr u64)(le64)(n)) +# define cpu_to_be16(n) ((_force_attr be16)bswap16(n)) +# define cpu_to_be32(n) ((_force_attr be32)bswap32(n)) +# define cpu_to_be64(n) ((_force_attr be64)bswap64(n)) +# define be16_to_cpu(n) bswap16((_force_attr u16)(be16)(n)) +# define be32_to_cpu(n) bswap32((_force_attr u32)(be32)(n)) +# define be64_to_cpu(n) bswap64((_force_attr u64)(be64)(n)) +#endif -# define cpu_to_be16(n) ((_force_attr be16)bswap16(n)) -# define cpu_to_be32(n) ((_force_attr be32)bswap32(n)) -# define cpu_to_be64(n) ((_force_attr be64)bswap64(n)) -# define be16_to_cpu(n) bswap16((_force_attr u16)(be16)(n)) -# define be32_to_cpu(n) bswap32((_force_attr u32)(be32)(n)) -# define be64_to_cpu(n) bswap64((_force_attr u64)(be64)(n)) -# endif #endif /* _NTFS_ENDIANS_H */ - #endif /* _WIMLIB_ENDIANNESS_H */ diff --git a/include/wimlib/lz_extend.h b/include/wimlib/lz_extend.h index 7d7ed8b8..e9a56224 100644 --- a/include/wimlib/lz_extend.h +++ b/include/wimlib/lz_extend.h @@ -10,13 +10,8 @@ #ifndef _WIMLIB_LZ_EXTEND_H #define _WIMLIB_LZ_EXTEND_H -#include "wimlib/types.h" - -#if (defined(__x86_64__) || defined(__i386__)) && defined(__GNUC__) -# define HAVE_FAST_LZ_EXTEND 1 -#else -# define HAVE_FAST_LZ_EXTEND 0 -#endif +#include "wimlib/bitops.h" +#include "wimlib/unaligned.h" /* Return the number of bytes at @matchptr that match the bytes at @strptr, up * to a maximum of @max_len. Initially, @start_len bytes are matched. 
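 * For example, with strptr = "abcde...", matchptr = "abcxy...", start_len = 2
 * and a sufficiently large max_len, the return value would be 3: bytes 0 and 1
 * are assumed to match, byte 2 ('c') matches, and byte 3 ('d' vs. 'x') differs.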
*/ @@ -26,49 +21,45 @@ lz_extend(const u8 * const strptr, const u8 * const matchptr, { u32 len = start_len; -#if HAVE_FAST_LZ_EXTEND - - while (len + sizeof(unsigned long) <= max_len) { - unsigned long x; + if (UNALIGNED_ACCESS_IS_FAST) { - x = *(const unsigned long *)&matchptr[len] ^ - *(const unsigned long *)&strptr[len]; - if (x != 0) - return len + (__builtin_ctzl(x) >> 3); - len += sizeof(unsigned long); - } + machine_word_t v_word; - if (sizeof(unsigned int) < sizeof(unsigned long) && - len + sizeof(unsigned int) <= max_len) - { - unsigned int x; + if (likely(max_len - len >= 4 * WORDSIZE)) { - x = *(const unsigned int *)&matchptr[len] ^ - *(const unsigned int *)&strptr[len]; - if (x != 0) - return len + (__builtin_ctz(x) >> 3); - len += sizeof(unsigned int); - } + #define COMPARE_WORD_STEP \ + v_word = load_word_unaligned(&matchptr[len]) ^ \ + load_word_unaligned(&strptr[len]); \ + if (v_word != 0) \ + goto word_differs; \ + len += WORDSIZE; \ - if (sizeof(unsigned int) == 4) { - if (len < max_len && matchptr[len] == strptr[len]) { - len++; - if (len < max_len && matchptr[len] == strptr[len]) { - len++; - if (len < max_len && matchptr[len] == strptr[len]) { - len++; - } - } + COMPARE_WORD_STEP + COMPARE_WORD_STEP + COMPARE_WORD_STEP + COMPARE_WORD_STEP + #undef COMPARE_WORD_STEP } - return len; - } -#endif /* HAVE_FAST_LZ_EXTEND */ + while (len + WORDSIZE <= max_len) { + v_word = load_word_unaligned(&matchptr[len]) ^ + load_word_unaligned(&strptr[len]); + if (v_word != 0) + goto word_differs; + len += WORDSIZE; + } - while (len < max_len && matchptr[len] == strptr[len]) - len++; + while (len < max_len && matchptr[len] == strptr[len]) + len++; + return len; - return len; + word_differs: + return len + (ffsw(v_word) >> 3); + } else { + while (len < max_len && matchptr[len] == strptr[len]) + len++; + return len; + } } #endif /* _WIMLIB_LZ_EXTEND_H */ diff --git a/include/wimlib/lz_hash3.h b/include/wimlib/lz_hash3.h index feb2d9c3..b204984b 100644 --- a/include/wimlib/lz_hash3.h +++ b/include/wimlib/lz_hash3.h @@ -10,32 +10,45 @@ #ifndef _WIMLIB_LZ_HASH3_H #define _WIMLIB_LZ_HASH3_H -#include "wimlib/types.h" +#include "wimlib/unaligned.h" /* Constant for the multiplicative hash function. */ #define LZ_HASH_MULTIPLIER 0x1E35A7BD -/* Hash the next 3-byte sequence in the window, producing a hash of length - * 'num_bits' bits. 4 bytes must be available, since 32-bit unaligned load is - * faster on some architectures. */ static inline u32 -lz_hash(const u8 *p, unsigned int num_bits) +loaded_u32_to_u24(u32 v) { - u32 str; - u32 hash; + if (CPU_IS_LITTLE_ENDIAN) + return v & 0xFFFFFF; + else + return v >> 8; +} -#if defined(__i386__) || defined(__x86_64__) - /* Unaligned access allowed, and little endian CPU. - * Callers ensure that at least 4 (not 3) bytes are remaining. */ - str = *(const u32 *)p & 0xFFFFFF; -#else - str = ((u32)p[0] << 0) | ((u32)p[1] << 8) | ((u32)p[2] << 16); -#endif +static inline u32 +load_u24_unaligned(const u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) + return loaded_u32_to_u24(load_u32_unaligned(p)); + else + return ((u32)p[0] << 0) | ((u32)p[1] << 8) | ((u32)p[2] << 16); +} - hash = str * LZ_HASH_MULTIPLIER; +static inline u32 +lz_hash_u24(u32 str, unsigned num_bits) +{ + return (u32)(str * LZ_HASH_MULTIPLIER) >> (32 - num_bits); +} - /* High bits are more random than the low bits. */ - return hash >> (32 - num_bits); +/* + * Hash the next 3-byte sequence in the window, producing a hash of length + * 'num_bits' bits. 
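+ * The three bytes are read as a 24-bit little-endian integer and multiplied by
+ * LZ_HASH_MULTIPLIER; the top 'num_bits' bits of the 32-bit product (the most
+ * random ones) form the hash.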
At least LZ_HASH_REQUIRED_NBYTES must be available at 'p'; + * this might be 4 bytes rather than 3 because an unaligned load is faster on + * some architectures. + */ +static inline u32 +lz_hash(const u8 *p, unsigned num_bits) +{ + return lz_hash_u24(load_u24_unaligned(p), num_bits); } /* The number of bytes being hashed. */ diff --git a/include/wimlib/lz_repsearch.h b/include/wimlib/lz_repsearch.h index 6883bf62..0f031c12 100644 --- a/include/wimlib/lz_repsearch.h +++ b/include/wimlib/lz_repsearch.h @@ -11,6 +11,7 @@ #define _LZ_REPSEARCH_H #include "wimlib/lz_extend.h" +#include "wimlib/unaligned.h" extern u32 lz_extend_repmatch(const u8 *strptr, const u8 *matchptr, u32 max_len); @@ -31,18 +32,18 @@ lz_repsearch3(const u8 * const strptr, const u32 max_len, unsigned rep_max_idx; u32 rep_len; u32 rep_max_len; - const u16 str = *(const u16 *)strptr; + const u16 str = load_u16_unaligned(strptr); const u8 *matchptr; matchptr = strptr - recent_offsets[0]; - if (*(const u16 *)matchptr == str) + if (load_u16_unaligned(matchptr) == str) rep_max_len = lz_extend_repmatch(strptr, matchptr, max_len); else rep_max_len = 0; rep_max_idx = 0; matchptr = strptr - recent_offsets[1]; - if (*(const u16 *)matchptr == str) { + if (load_u16_unaligned(matchptr) == str) { rep_len = lz_extend_repmatch(strptr, matchptr, max_len); if (rep_len > rep_max_len) { rep_max_len = rep_len; @@ -51,7 +52,7 @@ lz_repsearch3(const u8 * const strptr, const u32 max_len, } matchptr = strptr - recent_offsets[2]; - if (*(const u16 *)matchptr == str) { + if (load_u16_unaligned(matchptr) == str) { rep_len = lz_extend_repmatch(strptr, matchptr, max_len); if (rep_len > rep_max_len) { rep_max_len = rep_len; diff --git a/include/wimlib/lzx.h b/include/wimlib/lzx.h index da0c5514..5bd6b140 100644 --- a/include/wimlib/lzx.h +++ b/include/wimlib/lzx.h @@ -8,6 +8,7 @@ #define _WIMLIB_LZX_H #include "wimlib/assert.h" +#include "wimlib/bitops.h" #include "wimlib/compiler.h" #include "wimlib/lzx_constants.h" #include "wimlib/util.h" @@ -40,7 +41,7 @@ lzx_get_offset_slot_raw(u32 adjusted_offset) return (adjusted_offset >> 17) + 34; } else { LZX_ASSERT(2 <= adjusted_offset && adjusted_offset < 655360); - unsigned mssb_idx = bsr32(adjusted_offset); + unsigned mssb_idx = fls32(adjusted_offset); return (mssb_idx << 1) | ((adjusted_offset >> (mssb_idx - 1)) & 1); } diff --git a/include/wimlib/types.h b/include/wimlib/types.h index ac67cf4a..84b55be3 100644 --- a/include/wimlib/types.h +++ b/include/wimlib/types.h @@ -47,4 +47,13 @@ typedef struct WIMStruct WIMStruct; # define WIMLIB_WIMSTRUCT_DECLARED #endif -#endif +/* + * Type of a machine word. 'unsigned long' would be logical, but that is only + * 32 bits on x86_64 Windows. The same applies to 'uint_fast32_t'. So the best + * we can do without a bunch of #ifdefs appears to be 'size_t'. 
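+ * ('size_t' is not formally guaranteed to be the native word size, but it is
+ * 4 bytes on the 32-bit and 8 bytes on the 64-bit platforms wimlib targets,
+ * which is all that the WORDSIZE-dependent code assumes.)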
+ */ +typedef size_t machine_word_t; + +#define WORDSIZE sizeof(machine_word_t) + +#endif /* _WIMLIB_TYPES_H */ diff --git a/include/wimlib/unaligned.h b/include/wimlib/unaligned.h index 7c2ade22..ccc86f01 100644 --- a/include/wimlib/unaligned.h +++ b/include/wimlib/unaligned.h @@ -10,9 +10,9 @@ #ifndef _WIMLIB_UNALIGNED_H #define _WIMLIB_UNALIGNED_H -#include "compiler.h" -#include "endianness.h" -#include "types.h" +#include "wimlib/compiler.h" +#include "wimlib/endianness.h" +#include "wimlib/types.h" #define DEFINE_UNALIGNED_TYPE(type) \ struct type##_unaligned { \ @@ -31,10 +31,99 @@ store_##type##_unaligned(type val, void *p) \ ((struct type##_unaligned *)p)->v = val; \ } +DEFINE_UNALIGNED_TYPE(u16); +DEFINE_UNALIGNED_TYPE(u32); +DEFINE_UNALIGNED_TYPE(u64); DEFINE_UNALIGNED_TYPE(le16); DEFINE_UNALIGNED_TYPE(le32); DEFINE_UNALIGNED_TYPE(le64); +DEFINE_UNALIGNED_TYPE(be16); DEFINE_UNALIGNED_TYPE(be32); +DEFINE_UNALIGNED_TYPE(be64); DEFINE_UNALIGNED_TYPE(size_t); +DEFINE_UNALIGNED_TYPE(machine_word_t); + +#define load_word_unaligned load_machine_word_t_unaligned +#define store_word_unaligned store_machine_word_t_unaligned + +static inline void +copy_word_unaligned(const void *src, void *dst) +{ + store_word_unaligned(load_word_unaligned(src), dst); +} + +static inline machine_word_t +repeat_byte(u8 b) +{ + machine_word_t v; + + BUILD_BUG_ON(WORDSIZE != 4 && WORDSIZE != 8); + + v = b; + v |= v << 8; + v |= v << 16; + v |= v << ((WORDSIZE == 8) ? 32 : 0); + return v; +} + +static inline u16 +get_unaligned_u16_le(const void *p) +{ + u16 v; + + if (UNALIGNED_ACCESS_IS_FAST) { + v = le16_to_cpu(load_le16_unaligned(p)); + } else { + const u8 *p8 = p; + v = 0; + v |= (u16)p8[0] << 0; + v |= (u16)p8[1] << 8; + } + return v; +} + +static inline u32 +get_unaligned_u32_le(const void *p) +{ + u32 v; + + if (UNALIGNED_ACCESS_IS_FAST) { + v = le32_to_cpu(load_le32_unaligned(p)); + } else { + const u8 *p8 = p; + v = 0; + v |= (u32)p8[0] << 0; + v |= (u32)p8[1] << 8; + v |= (u32)p8[2] << 16; + v |= (u32)p8[3] << 24; + } + return v; +} + +static inline void +put_unaligned_u16_le(u16 v, void *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_le16_unaligned(cpu_to_le16(v), p); + } else { + u8 *p8 = p; + p8[0] = (v >> 0) & 0xFF; + p8[1] = (v >> 8) & 0xFF; + } +} + +static inline void +put_unaligned_u32_le(u32 v, void *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_le32_unaligned(cpu_to_le32(v), p); + } else { + u8 *p8 = p; + p8[0] = (v >> 0) & 0xFF; + p8[1] = (v >> 8) & 0xFF; + p8[2] = (v >> 16) & 0xFF; + p8[3] = (v >> 24) & 0xFF; + } +} #endif /* _WIMLIB_UNALIGNED_H */ diff --git a/include/wimlib/util.h b/include/wimlib/util.h index 91bed5a4..0aa40d1b 100644 --- a/include/wimlib/util.h +++ b/include/wimlib/util.h @@ -6,20 +6,6 @@ #include -#ifndef min -#define min(a, b) ({ typeof(a) __a = (a); typeof(b) __b = (b); \ - (__a < __b) ? __a : __b; }) -#endif - -#ifndef max -#define max(a, b) ({ typeof(a) __a = (a); typeof(b) __b = (b); \ - (__a > __b) ? __a : __b; }) -#endif - -#ifndef swap -#define swap(a, b) ({typeof(a) _a = a; (a) = (b); (b) = _a;}) -#endif - /** * container_of - cast a member of a structure out to the containing structure * @ptr: the pointer to the member. @@ -46,8 +32,6 @@ /* Used for buffering FILE IO in a few places */ #define BUFFER_SIZE 32768 -#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) - /* Maximum number of array elements to allocate on the stack (used in various * places when large temporary buffers are needed). 
*/ #define STACK_MAX 32768 @@ -59,7 +43,7 @@ extern void wimlib_free_memory(void *p); extern void * -wimlib_realloc(void *, size_t) _warn_unused_result_attribute; +wimlib_realloc(void *, size_t); extern void * wimlib_calloc(size_t nmemb, size_t size) _malloc_attribute; @@ -107,22 +91,6 @@ randomize_char_array_with_alnum(tchar p[], size_t n); extern void print_byte_field(const u8 field[], size_t len, FILE *out); -static inline u32 -bsr32(u32 n) -{ -#if defined(__x86__) || defined(__x86_64__) - asm("bsrl %0, %0;" - : "=r"(n) - : "0" (n)); - return n; -#else - u32 pow = 0; - while ((n >>= 1) != 0) - pow++; - return pow; -#endif -} - static inline bool is_power_of_2(unsigned long n) { diff --git a/src/decompress_common.c b/src/decompress_common.c index 66440efe..4ab9ccbe 100644 --- a/src/decompress_common.c +++ b/src/decompress_common.c @@ -15,16 +15,16 @@ #endif #include "wimlib/decompress_common.h" -#include "wimlib/util.h" /* for BUILD_BUG_ON() */ #include +#define USE_WORD_FILL + #ifdef __GNUC__ # ifdef __SSE2__ +# undef USE_WORD_FILL # define USE_SSE2_FILL # include -# else -# define USE_LONG_FILL # endif #endif @@ -142,8 +142,8 @@ make_huffman_decode_table(u16 decode_table[const restrict], unsigned stores_per_loop; unsigned decode_table_pos; -#ifdef USE_LONG_FILL - const unsigned entries_per_long = sizeof(unsigned long) / sizeof(decode_table[0]); +#ifdef USE_WORD_FILL + const unsigned entries_per_word = WORDSIZE / sizeof(decode_table[0]); #endif #ifdef USE_SSE2_FILL @@ -230,7 +230,7 @@ make_huffman_decode_table(u16 decode_table[const restrict], * have the most entries. From there, the number of entries per * codeword will decrease. As an optimization, we may begin filling * entries with SSE2 vector accesses (8 entries/store), then change to - * 'unsigned long' accesses (2 or 4 entries/store), then change to + * 'machine_word_t' accesses (2 or 4 entries/store), then change to * 16-bit accesses (1 entry/store). */ decode_table_ptr = decode_table; sym_idx = 0; @@ -242,11 +242,11 @@ make_huffman_decode_table(u16 decode_table[const restrict], for (; stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1) { unsigned end_sym_idx = sym_idx + len_counts[codeword_len]; for (; sym_idx < end_sym_idx; sym_idx++) { - /* Note: unlike in the 'long' version below, the __m128i - * type already has __attribute__((may_alias)), so using - * it to access the decode table, which is an array of - * unsigned shorts, will not violate strict aliasing. - */ + /* Note: unlike in the machine_word_t version below, the + * __m128i type already has __attribute__((may_alias)), + * so using it to access the decode table, which is an + * array of unsigned shorts, will not violate strict + * aliasing. */ u16 entry; __m128i v; __m128i *p; @@ -265,36 +265,33 @@ make_huffman_decode_table(u16 decode_table[const restrict], } #endif /* USE_SSE2_FILL */ -#ifdef USE_LONG_FILL - /* Fill the entries one 'unsigned long' at a time. +#ifdef USE_WORD_FILL + /* Fill the entries one machine word at a time. * On 32-bit systems this is 2 entries per store, while on 64-bit * systems this is 4 entries per store. 
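 * Each 16-bit entry is first replicated across the word (v |= v << 16, then
 * v |= v << 32 on 64-bit); for example, a codeword 2 bits shorter than
 * 'table_bits' owns 4 consecutive entries, i.e. one 64-bit or two 32-bit stores.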
*/ - stores_per_loop = (1 << (table_bits - codeword_len)) / entries_per_long; + stores_per_loop = (1 << (table_bits - codeword_len)) / entries_per_word; for (; stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1) { unsigned end_sym_idx = sym_idx + len_counts[codeword_len]; for (; sym_idx < end_sym_idx; sym_idx++) { - /* Accessing the array of unsigned shorts as unsigned - * longs would violate strict aliasing and would require - * compiling the code with -fno-strict-aliasing to - * guarantee correctness. To work around this problem, - * use the gcc 'may_alias' extension to define a special - * unsigned long type that may alias any other in-memory - * variable. */ - typedef unsigned long __attribute__((may_alias)) aliased_long_t; - - unsigned long v; - aliased_long_t *p; + /* Accessing the array of u16 as u32 or u64 would + * violate strict aliasing and would require compiling + * the code with -fno-strict-aliasing to guarantee + * correctness. To work around this problem, use the + * gcc 'may_alias' extension. */ + typedef machine_word_t _may_alias_attribute aliased_word_t; + + machine_word_t v; + aliased_word_t *p; unsigned n; - BUILD_BUG_ON(sizeof(unsigned long) != 4 && - sizeof(unsigned long) != 8); + BUILD_BUG_ON(WORDSIZE != 4 && WORDSIZE != 8); v = MAKE_DIRECT_ENTRY(sorted_syms[sym_idx], codeword_len); v |= v << 16; - v |= v << (sizeof(unsigned long) == 8 ? 32 : 0); + v |= v << (WORDSIZE == 8 ? 32 : 0); - p = (aliased_long_t *)decode_table_ptr; + p = (aliased_word_t *)decode_table_ptr; n = stores_per_loop; do { @@ -303,7 +300,7 @@ make_huffman_decode_table(u16 decode_table[const restrict], decode_table_ptr = p; } } -#endif /* USE_LONG_FILL */ +#endif /* USE_WORD_FILL */ /* Fill the entries one 16-bit integer at a time. */ stores_per_loop = (1 << (table_bits - codeword_len)); diff --git a/src/lz_hash_chains.c b/src/lz_hash_chains.c index 7e3e10b8..c116f125 100644 --- a/src/lz_hash_chains.c +++ b/src/lz_hash_chains.c @@ -138,6 +138,7 @@ lz_hc_get_matches(struct lz_mf *_mf, struct lz_match matches[]) struct lz_match *lz_matchptr = matches; u32 hash; u32 cur_match; + u32 sequence; if (unlikely(bytes_remaining < LZ_HASH_REQUIRED_NBYTES + 1)) return 0; @@ -159,6 +160,9 @@ lz_hc_get_matches(struct lz_mf *_mf, struct lz_match matches[]) if (unlikely(best_len >= max_len)) return 0; + if (UNALIGNED_ACCESS_IS_FAST) + sequence = load_u24_unaligned(strptr); + /* Search the appropriate hash chain for matches. */ for (; cur_match && depth_remaining--; cur_match = prev_tab[cur_match]) { @@ -173,60 +177,60 @@ lz_hc_get_matches(struct lz_mf *_mf, struct lz_match matches[]) if (matchptr[best_len] != strptr[best_len]) goto next_match; - #if HAVE_FAST_LZ_EXTEND - if ((*(const u32 *)strptr & 0xFFFFFF) != - (*(const u32 *)matchptr & 0xFFFFFF)) - goto next_match; + if (UNALIGNED_ACCESS_IS_FAST) { + if (load_u24_unaligned(matchptr) != sequence) + goto next_match; + + len = lz_extend(strptr, matchptr, 3, max_len); + + if (len > best_len) { + best_len = len; + + *lz_matchptr++ = (struct lz_match) { + .len = best_len, + .offset = strptr - matchptr, + }; + + if (best_len >= nice_len) + break; + } + } else { + + /* The bytes at indices 'best_len - 1' and '0' are less + * important to check separately. But doing so still + * gives a slight performance improvement, at least on + * x86_64, probably because they create separate + * branches for the CPU to predict independently of the + * branches in the main comparison loops. 
+ */ + if (matchptr[best_len - 1] != strptr[best_len - 1] || + matchptr[0] != strptr[0]) + goto next_match; + + for (len = 1; len < best_len - 1; len++) + if (matchptr[len] != strptr[len]) + goto next_match; - len = lz_extend(strptr, matchptr, 3, max_len); + /* The match is the longest found so far --- at least + * 'best_len' + 1 bytes. Continue extending it. */ - if (len > best_len) { - best_len = len; + if (++best_len != max_len && + strptr[best_len] == matchptr[best_len]) + while (++best_len != max_len) + if (strptr[best_len] != matchptr[best_len]) + break; + /* Record the match. */ *lz_matchptr++ = (struct lz_match) { .len = best_len, .offset = strptr - matchptr, }; + /* Terminate the search if 'nice_len' was reached. */ if (best_len >= nice_len) break; } - #else /* HAVE_FAST_LZ_EXTEND */ - - /* The bytes at indices 'best_len - 1' and '0' are less - * important to check separately. But doing so still gives a - * slight performance improvement, at least on x86_64, probably - * because they create separate branches for the CPU to predict - * independently of the branches in the main comparison loops. - */ - if (matchptr[best_len - 1] != strptr[best_len - 1] || - matchptr[0] != strptr[0]) - goto next_match; - - for (len = 1; len < best_len - 1; len++) - if (matchptr[len] != strptr[len]) - goto next_match; - - /* The match is the longest found so far --- - * at least 'best_len' + 1 bytes. Continue extending it. */ - - if (++best_len != max_len && strptr[best_len] == matchptr[best_len]) - while (++best_len != max_len) - if (strptr[best_len] != matchptr[best_len]) - break; - - /* Record the match. */ - *lz_matchptr++ = (struct lz_match) { - .len = best_len, - .offset = strptr - matchptr, - }; - - /* Terminate the search if 'nice_len' was reached. */ - if (best_len >= nice_len) - break; - #endif /* !HAVE_FAST_LZ_EXTEND */ - next_match: /* Continue to next match in the chain. */ ; diff --git a/src/lzms-common.c b/src/lzms-common.c index 95682d53..41383f4c 100644 --- a/src/lzms-common.c +++ b/src/lzms-common.c @@ -26,6 +26,7 @@ # include "config.h" #endif +#include "wimlib/bitops.h" #include "wimlib/endianness.h" #include "wimlib/lzms.h" #include "wimlib/unaligned.h" @@ -95,7 +96,7 @@ lzms_decode_delta_rle_slot_bases(u32 slot_bases[], LZMS_ASSERT(slot == expected_num_slots); slot_bases[slot] = final; - extra_bits[slot - 1] = bsr32(slot_bases[slot] - slot_bases[slot - 1]); + extra_bits[slot - 1] = fls32(slot_bases[slot] - slot_bases[slot - 1]); } /* Initialize the global offset and length slot tables. 
*/ @@ -159,19 +160,19 @@ lzms_maybe_do_x86_translation(u8 data[restrict], s32 i, s32 num_op_bytes, LZMS_DEBUG("Undid x86 translation at position %d " "(opcode 0x%02x)", i, data[i]); void *p32 = &data[i + num_op_bytes]; - u32 n = le32_to_cpu(load_le32_unaligned(p32)); - store_le32_unaligned(cpu_to_le32(n - i), p32); + u32 n = get_unaligned_u32_le(p32); + put_unaligned_u32_le(n - i, p32); } - pos = i + le16_to_cpu(load_le16_unaligned(&data[i + num_op_bytes])); + pos = i + get_unaligned_u16_le(&data[i + num_op_bytes]); } else { - pos = i + le16_to_cpu(load_le16_unaligned(&data[i + num_op_bytes])); + pos = i + get_unaligned_u16_le(&data[i + num_op_bytes]); if (i - *closest_target_usage_p <= max_trans_offset) { LZMS_DEBUG("Did x86 translation at position %d " "(opcode 0x%02x)", i, data[i]); void *p32 = &data[i + num_op_bytes]; - u32 n = le32_to_cpu(load_le32_unaligned(p32)); - store_le32_unaligned(cpu_to_le32(n + i), p32); + u32 n = get_unaligned_u32_le(p32); + put_unaligned_u32_le(n + i, p32); } } diff --git a/src/lzms-compress.c b/src/lzms-compress.c index da1904fc..3686db0a 100644 --- a/src/lzms-compress.c +++ b/src/lzms-compress.c @@ -32,6 +32,7 @@ #include "wimlib/lz_mf.h" #include "wimlib/lz_repsearch.h" #include "wimlib/lzms.h" +#include "wimlib/unaligned.h" #include "wimlib/util.h" #include @@ -338,7 +339,7 @@ lzms_output_bitstream_put_varbits(struct lzms_output_bitstream *os, /* Write a coding unit, unless it would underflow the buffer. */ if (os->next != os->begin) - *--os->next = cpu_to_le16(os->bitbuf >> os->bitcount); + put_unaligned_u16_le(os->bitbuf >> os->bitcount, --os->next); /* Optimization for call sites that never write more than 16 * bits at once. */ @@ -357,7 +358,7 @@ lzms_output_bitstream_flush(struct lzms_output_bitstream *os) return false; if (os->bitcount != 0) - *--os->next = cpu_to_le16(os->bitbuf << (16 - os->bitcount)); + put_unaligned_u16_le(os->bitbuf << (16 - os->bitcount), --os->next); return true; } @@ -401,9 +402,11 @@ lzms_range_encoder_raw_shift_low(struct lzms_range_encoder_raw *rc) * ((rc->low >> 32) != 0, a.k.a. the carry bit is 1). 
*/ do { if (likely(rc->next >= rc->begin)) { - if (rc->next != rc->end) - *rc->next++ = cpu_to_le16(rc->cache + - (u16)(rc->low >> 32)); + if (rc->next != rc->end) { + put_unaligned_u16_le(rc->cache + + (u16)(rc->low >> 32), + rc->next++); + } } else { rc->next++; } diff --git a/src/lzms-decompress.c b/src/lzms-decompress.c index 43cd9acb..96c501aa 100644 --- a/src/lzms-decompress.c +++ b/src/lzms-decompress.c @@ -209,6 +209,7 @@ #include "wimlib/decompress_common.h" #include "wimlib/error.h" #include "wimlib/lzms.h" +#include "wimlib/unaligned.h" #include "wimlib/util.h" #include @@ -393,7 +394,7 @@ lzms_input_bitstream_ensure_bits(struct lzms_input_bitstream *is, if (unlikely(is->num_le16_remaining == 0)) return -1; - next = le16_to_cpu(*--is->in); + next = get_unaligned_u16_le(--is->in); is->num_le16_remaining--; is->bitbuf |= next << (sizeof(is->bitbuf) * 8 - is->num_filled_bits - 16); @@ -450,8 +451,8 @@ lzms_range_decoder_raw_init(struct lzms_range_decoder_raw *rd, const le16 *in, size_t in_limit) { rd->range = 0xffffffff; - rd->code = ((u32)le16_to_cpu(in[0]) << 16) | - ((u32)le16_to_cpu(in[1]) << 0); + rd->code = ((u32)get_unaligned_u16_le(&in[0]) << 16) | + ((u32)get_unaligned_u16_le(&in[1]) << 0); rd->in = in + 2; rd->num_le16_remaining = in_limit - 2; } @@ -465,7 +466,7 @@ lzms_range_decoder_raw_normalize(struct lzms_range_decoder_raw *rd) rd->range <<= 16; if (unlikely(rd->num_le16_remaining == 0)) return -1; - rd->code = (rd->code << 16) | le16_to_cpu(*rd->in++); + rd->code = (rd->code << 16) | get_unaligned_u16_le(rd->in++); rd->num_le16_remaining--; } return 0; @@ -667,7 +668,7 @@ lzms_copy_lz_match(struct lzms_decompressor *ctx, u32 length, u32 offset) out_next = ctx->out_next; - lz_copy(out_next, length, offset, ctx->out_end); + lz_copy(out_next, length, offset, ctx->out_end, 1); ctx->out_next = out_next + length; return 0; diff --git a/src/lzx-common.c b/src/lzx-common.c index 9f55f171..1221f61e 100644 --- a/src/lzx-common.c +++ b/src/lzx-common.c @@ -23,6 +23,7 @@ # include "config.h" #endif +#include "wimlib/bitops.h" #include "wimlib/endianness.h" #include "wimlib/lzx.h" #include "wimlib/unaligned.h" @@ -75,7 +76,7 @@ lzx_get_window_order(size_t max_block_size) if (max_block_size == 0 || max_block_size > LZX_MAX_WINDOW_SIZE) return 0; - order = bsr32(max_block_size); + order = fls32(max_block_size); if (((u32)1 << order) != max_block_size) order++; @@ -125,7 +126,7 @@ do_translate_target(void *target, s32 input_pos) { s32 abs_offset, rel_offset; - rel_offset = le32_to_cpu(load_le32_unaligned(target)); + rel_offset = get_unaligned_u32_le(target); if (rel_offset >= -input_pos && rel_offset < LZX_WIM_MAGIC_FILESIZE) { if (rel_offset < LZX_WIM_MAGIC_FILESIZE - input_pos) { /* "good translation" */ @@ -134,7 +135,7 @@ do_translate_target(void *target, s32 input_pos) /* "compensating translation" */ abs_offset = rel_offset - LZX_WIM_MAGIC_FILESIZE; } - store_le32_unaligned(cpu_to_le32(abs_offset), target); + put_unaligned_u32_le(abs_offset, target); } } @@ -143,20 +144,18 @@ undo_translate_target(void *target, s32 input_pos) { s32 abs_offset, rel_offset; - abs_offset = le32_to_cpu(load_le32_unaligned(target)); + abs_offset = get_unaligned_u32_le(target); if (abs_offset >= 0) { if (abs_offset < LZX_WIM_MAGIC_FILESIZE) { /* "good translation" */ rel_offset = abs_offset - input_pos; - - store_le32_unaligned(cpu_to_le32(rel_offset), target); + put_unaligned_u32_le(rel_offset, target); } } else { if (abs_offset >= -input_pos) { /* "compensating translation" */ rel_offset = 
abs_offset + LZX_WIM_MAGIC_FILESIZE; - - store_le32_unaligned(cpu_to_le32(rel_offset), target); + put_unaligned_u32_le(rel_offset, target); } } } @@ -195,6 +194,7 @@ inline /* Although inlining the 'process_target' function still speeds up the void lzx_e8_filter(u8 *data, u32 size, void (*process_target)(void *, s32)) { + u8 *p = data; #ifdef __SSE2__ /* SSE2 vectorized implementation for x86_64. This speeds up LZX * decompression by about 5-8% overall. (Usually --- the performance @@ -202,11 +202,11 @@ lzx_e8_filter(u8 *data, u32 size, void (*process_target)(void *, s32)) * consists entirely of 0xe8 bytes. Also, this optimization affects * compression as well, but the percentage improvement is less because * LZX compression is much slower than LZX decompression. ) */ - __m128i *p128 = (__m128i *)data; - u32 valid_mask = 0xFFFFFFFF; + if (size >= 32 && (uintptr_t)p % 16 == 0) { + + u32 valid_mask = 0xFFFFFFFF; - if (size >= 32 && (uintptr_t)data % 16 == 0) { - __m128i * const end128 = p128 + size / 16 - 1; + u8 * const vec_end = p + (size & ~15) - 16; /* Create a vector of all 0xe8 bytes */ const __m128i e8_bytes = _mm_set1_epi8(0xe8); @@ -216,7 +216,8 @@ lzx_e8_filter(u8 *data, u32 size, void (*process_target)(void *, s32)) /* Compare the current 16-byte vector with the vector of * all 0xe8 bytes. This produces 0xff where the byte is * 0xe8 and 0x00 where it is not. */ - __m128i cmpresult = _mm_cmpeq_epi8(*p128, e8_bytes); + __m128i cmpresult = _mm_cmpeq_epi8(*(const __m128i *)p, + e8_bytes); /* Map the comparison results into a single 16-bit * number. It will contain a 1 bit when the @@ -244,12 +245,11 @@ lzx_e8_filter(u8 *data, u32 size, void (*process_target)(void *, s32)) * index of the byte, within the 16, at * which the next e8 translation should * be done. */ - u32 bit = __builtin_ctz(e8_mask); + int bit = ffs32(e8_mask); /* Do (or undo) the e8 translation. */ - u8 *p8 = (u8 *)p128 + bit; - (*process_target)(p8 + 1, - p8 - data); + (*process_target)(p + bit + 1, + p + bit - data); /* Don't start an e8 translation in the * next 4 bytes. */ @@ -260,30 +260,27 @@ lzx_e8_filter(u8 *data, u32 size, void (*process_target)(void *, s32)) valid_mask >>= 16; valid_mask |= 0xFFFF0000; } - } while (++p128 < end128); - } + } while ((p += 16) < vec_end); - u8 *p8 = (u8 *)p128; - while (!(valid_mask & 1)) { - p8++; - valid_mask >>= 1; + while (!(valid_mask & 1)) { + p++; + valid_mask >>= 1; + } } -#else /* __SSE2__ */ - u8 *p8 = data; #endif /* !__SSE2__ */ if (size > 10) { /* Finish any bytes that weren't processed by the vectorized * implementation. */ - u8 *p8_end = data + size - 10; + u8 *end = data + size - 10; do { - if (*p8 == 0xe8) { - (*process_target)(p8 + 1, p8 - data); - p8 += 5; + if (*p == 0xe8) { + (*process_target)(p + 1, p - data); + p += 5; } else { - p8++; + p++; } - } while (p8 < p8_end); + } while (p < end); } } diff --git a/src/lzx-compress.c b/src/lzx-compress.c index 2cb35e3a..201a1cfb 100644 --- a/src/lzx-compress.c +++ b/src/lzx-compress.c @@ -357,7 +357,7 @@ lzx_write_varbits(struct lzx_output_bitstream *os, /* Write a coding unit, unless it would overflow the buffer. */ if (os->next != os->end) - *os->next++ = cpu_to_le16(os->bitbuf >> os->bitcount); + put_unaligned_u16_le(os->bitbuf >> os->bitcount, os->next++); /* If writing 17 bits, a second coding unit might need to be * written. But because 'max_num_bits' is a compile-time @@ -365,7 +365,7 @@ lzx_write_varbits(struct lzx_output_bitstream *os, * call sites. 
*/ if (max_num_bits == 17 && os->bitcount == 16) { if (os->next != os->end) - *os->next++ = cpu_to_le16(os->bitbuf); + put_unaligned_u16_le(os->bitbuf, os->next++); os->bitcount = 0; } } @@ -391,7 +391,7 @@ lzx_flush_output(struct lzx_output_bitstream *os) return 0; if (os->bitcount != 0) - *os->next++ = cpu_to_le16(os->bitbuf << (16 - os->bitcount)); + put_unaligned_u16_le(os->bitbuf << (16 - os->bitcount), os->next++); return (const u8 *)os->next - (const u8 *)os->start; } diff --git a/src/lzx-decompress.c b/src/lzx-decompress.c index 4c7694b6..2c466512 100644 --- a/src/lzx-decompress.c +++ b/src/lzx-decompress.c @@ -500,7 +500,8 @@ lzx_decompress_block(int block_type, u32 block_size, if (unlikely(match_offset > window_ptr - window)) return -1; - lz_copy(window_ptr, match_len, match_offset, window_end); + lz_copy(window_ptr, match_len, match_offset, window_end, + LZX_MIN_MATCH_LEN); window_ptr += match_len; } diff --git a/src/resource.c b/src/resource.c index fb5761d9..20e27e17 100644 --- a/src/resource.c +++ b/src/resource.c @@ -26,6 +26,7 @@ #endif #include "wimlib/assert.h" +#include "wimlib/bitops.h" #include "wimlib/endianness.h" #include "wimlib/error.h" #include "wimlib/file_io.h" @@ -214,7 +215,7 @@ read_compressed_wim_resource(const struct wim_resource_spec * const rspec, } } - const u32 chunk_order = bsr32(chunk_size); + const u32 chunk_order = fls32(chunk_size); /* Calculate the total number of chunks the resource is divided into. */ const u64 num_chunks = (rspec->uncompressed_size + chunk_size - 1) >> chunk_order; @@ -326,8 +327,8 @@ read_compressed_wim_resource(const struct wim_resource_spec * const rspec, /* Now fill in chunk_offsets from the entries we have read in * chunk_tab_data. We break aliasing rules here to avoid having * to allocate yet another array. */ - typedef le64 __attribute__((may_alias)) aliased_le64_t; - typedef le32 __attribute__((may_alias)) aliased_le32_t; + typedef le64 _may_alias_attribute aliased_le64_t; + typedef le32 _may_alias_attribute aliased_le32_t; u64 * chunk_offsets_p = chunk_offsets; if (alt_chunk_table) { diff --git a/src/util.c b/src/util.c index 6e9ae23e..3b13c503 100644 --- a/src/util.c +++ b/src/util.c @@ -23,12 +23,15 @@ # include "config.h" #endif +/* Make sure the POSIX-compatible strerror_r() is declared, rather than the GNU + * version, which has a different return type. */ #ifdef _GNU_SOURCE # define _GNU_SOURCE_DEFINED 1 # undef _GNU_SOURCE +# ifndef _POSIX_C_SOURCE +# define _POSIX_C_SOURCE 200112L +# endif #endif -/* Make sure the POSIX-compatible strerror_r() is declared, rather than the GNU - * version, which has a different return type. */ #include #ifdef _GNU_SOURCE_DEFINED # define _GNU_SOURCE diff --git a/src/wim.c b/src/wim.c index 8422d9e1..51fc1f04 100644 --- a/src/wim.c +++ b/src/wim.c @@ -24,6 +24,7 @@ #endif #include "wimlib.h" +#include "wimlib/bitops.h" #include "wimlib/dentry.h" #include "wimlib/encoding.h" #include "wimlib/file_io.h" @@ -96,7 +97,7 @@ wim_chunk_size_valid(u32 chunk_size, int ctype) /* Chunk size must be power of 2. 
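 * For example, fls32(0x8000) is 15 and 1U << 15 == 0x8000, so 32768 passes
 * this check, while fls32(0x9000) is also 15 but 1U << 15 != 0x9000, so any
 * non-power-of-2 size is rejected.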
*/ if (chunk_size == 0) return false; - order = bsr32(chunk_size); + order = fls32(chunk_size); if (chunk_size != 1U << order) return false; diff --git a/src/write.c b/src/write.c index 908de9ad..d8ce2dc3 100644 --- a/src/write.c +++ b/src/write.c @@ -517,8 +517,8 @@ end_chunk_table(struct write_streams_ctx *ctx, u64 res_actual_size, 0 != (ctx->write_resource_flags & WRITE_RESOURCE_FLAG_PACK_STREAMS)); - typedef le64 __attribute__((may_alias)) aliased_le64_t; - typedef le32 __attribute__((may_alias)) aliased_le32_t; + typedef le64 _may_alias_attribute aliased_le64_t; + typedef le32 _may_alias_attribute aliased_le32_t; if (chunk_entry_size == 4) { aliased_le32_t *entries = (aliased_le32_t*)ctx->chunk_csizes; diff --git a/src/xpress-compress.c b/src/xpress-compress.c index d59de36a..fdcc2fca 100644 --- a/src/xpress-compress.c +++ b/src/xpress-compress.c @@ -26,11 +26,13 @@ # include "config.h" #endif +#include "wimlib/bitops.h" #include "wimlib/compress_common.h" #include "wimlib/compressor_ops.h" #include "wimlib/endianness.h" #include "wimlib/error.h" #include "wimlib/lz_mf.h" +#include "wimlib/unaligned.h" #include "wimlib/util.h" #include "wimlib/xpress.h" @@ -216,7 +218,7 @@ xpress_write_bits(struct xpress_output_bitstream *os, if (os->bitcount > 16) { os->bitcount -= 16; if (os->end - os->next_byte >= 2) { - *(le16 *)os->next_bits = cpu_to_le16(os->bitbuf >> os->bitcount); + put_unaligned_u16_le(os->bitbuf >> os->bitcount, os->next_bits); os->next_bits = os->next_bits2; os->next_bits2 = os->next_byte; os->next_byte += 2; @@ -244,8 +246,8 @@ xpress_flush_output(struct xpress_output_bitstream *os) if (unlikely(os->end - os->next_byte < 2)) return 0; - *(le16 *)os->next_bits = cpu_to_le16(os->bitbuf << (16 - os->bitcount)); - *(le16 *)os->next_bits2 = cpu_to_le16(0); + put_unaligned_u16_le(os->bitbuf << (16 - os->bitcount), os->next_bits); + put_unaligned_u16_le(0, os->next_bits2); return os->next_byte - os->start; } @@ -332,8 +334,8 @@ xpress_declare_match(struct xpress_compressor *c, { unsigned adjusted_len = len - XPRESS_MIN_MATCH_LEN; unsigned len_hdr = min(adjusted_len, 0xf); - unsigned offset_bsr = bsr32(offset); - unsigned sym = XPRESS_NUM_CHARS + ((offset_bsr << 4) | len_hdr); + unsigned offset_high_bit = fls32(offset); + unsigned sym = XPRESS_NUM_CHARS + ((offset_high_bit << 4) | len_hdr); c->freqs[sym]++; @@ -341,8 +343,8 @@ xpress_declare_match(struct xpress_compressor *c, *(*next_chosen_item)++ = (struct xpress_item) { .data = (u64)sym | ((u64)adjusted_len << 9) | - ((u64)offset_bsr << 25) | - ((u64)(offset ^ (1U << offset_bsr)) << 29), + ((u64)offset_high_bit << 25) | + ((u64)(offset ^ (1U << offset_high_bit)) << 29), }; } } @@ -605,7 +607,7 @@ xpress_consider_matches(struct xpress_compressor *c, u32 cost; u32 position_cost; unsigned offset; - unsigned offset_bsr; + unsigned offset_high_bit; unsigned adjusted_len; unsigned len_hdr; unsigned sym; @@ -614,11 +616,11 @@ xpress_consider_matches(struct xpress_compressor *c, /* All lengths are small. Optimize accordingly. 
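 * ('Small' here means no candidate match length reaches
 * XPRESS_MIN_MATCH_LEN + 0xf, so 'len_hdr' below never needs the
 * min(adjusted_len, 0xf) cap that the general case further down applies.)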
*/ do { offset = matches[i].offset; - offset_bsr = bsr32(offset); + offset_high_bit = fls32(offset); len_hdr = len - XPRESS_MIN_MATCH_LEN; - sym = XPRESS_NUM_CHARS + ((offset_bsr << 4) | len_hdr); + sym = XPRESS_NUM_CHARS + ((offset_high_bit << 4) | len_hdr); - position_cost = cur_optimum_ptr->cost + offset_bsr; + position_cost = cur_optimum_ptr->cost + offset_high_bit; do { cost = position_cost + c->costs[sym]; if (cost < (cur_optimum_ptr + len)->cost) { @@ -633,12 +635,12 @@ xpress_consider_matches(struct xpress_compressor *c, /* Some lengths are big. */ do { offset = matches[i].offset; - offset_bsr = bsr32(offset); - position_cost = cur_optimum_ptr->cost + offset_bsr; + offset_high_bit = fls32(offset); + position_cost = cur_optimum_ptr->cost + offset_high_bit; do { adjusted_len = len - XPRESS_MIN_MATCH_LEN; len_hdr = min(adjusted_len, 0xf); - sym = XPRESS_NUM_CHARS + ((offset_bsr << 4) | len_hdr); + sym = XPRESS_NUM_CHARS + ((offset_high_bit << 4) | len_hdr); cost = position_cost + c->costs[sym]; if (adjusted_len >= 0xf) { diff --git a/src/xpress-decompress.c b/src/xpress-decompress.c index 123613d6..b43e5273 100644 --- a/src/xpress-decompress.c +++ b/src/xpress-decompress.c @@ -84,7 +84,7 @@ xpress_decode_window(struct input_bitstream *istream, const u16 *decode_table, u8 *window_end = &window[window_size]; unsigned sym; unsigned match_len; - unsigned offset_bsr; + unsigned offset_high_bit; unsigned match_offset; while (window_ptr != window_end) { @@ -99,12 +99,12 @@ xpress_decode_window(struct input_bitstream *istream, const u16 *decode_table, /* Match */ match_len = sym & 0xf; - offset_bsr = (sym >> 4) & 0xf; + offset_high_bit = (sym >> 4) & 0xf; bitstream_ensure_bits(istream, 16); - match_offset = (1 << offset_bsr) | - bitstream_pop_bits(istream, offset_bsr); + match_offset = (1 << offset_high_bit) | + bitstream_pop_bits(istream, offset_high_bit); if (match_len == 0xf) { match_len += bitstream_read_byte(istream); @@ -119,7 +119,8 @@ xpress_decode_window(struct input_bitstream *istream, const u16 *decode_table, if (unlikely(match_len > window_end - window_ptr)) return -1; - lz_copy(window_ptr, match_len, match_offset, window_end); + lz_copy(window_ptr, match_len, match_offset, window_end, + XPRESS_MIN_MATCH_LEN); window_ptr += match_len; } -- 2.43.0