AM_CPPFLAGS = -I$(top_srcdir)/include $(WINDOWS_CPPFLAGS) \
-D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE
-AM_CFLAGS = -std=gnu99 -Wmissing-prototypes -Wstrict-prototypes \
+AM_CFLAGS = -std=c99 -Wmissing-prototypes -Wstrict-prototypes \
-Werror-implicit-function-declaration \
-fno-common -Wundef -Wno-pointer-sign
include/wimlib/apply.h \
include/wimlib/assert.h \
include/wimlib/avl_tree.h \
+ include/wimlib/bitops.h \
include/wimlib/callback.h \
include/wimlib/capture.h \
include/wimlib/case.h \
include/wimlib/compiler.h \
+ include/wimlib/compiler-gcc.h \
include/wimlib/compressor_ops.h \
include/wimlib/compress_common.h \
include/wimlib/chunk_compressor.h \
--- /dev/null
+++ b/include/wimlib/bitops.h
+/*
+ * bitops.h
+ *
+ * Inline functions for bit manipulation.
+ *
+ * The author dedicates this file to the public domain.
+ * You can do whatever you want with this file.
+ */
+
+#ifndef _WIMLIB_BITOPS_H
+#define _WIMLIB_BITOPS_H
+
+#include "wimlib/compiler.h"
+#include "wimlib/types.h"
+
+/* Find Last Set bit */
+
+static inline unsigned
+fls32(u32 v)
+{
+#ifdef compiler_fls32
+ return compiler_fls32(v);
+#else
+ unsigned bit = 0;
+ while ((v >>= 1) != 0)
+ bit++;
+ return bit;
+#endif
+}
+
+static inline unsigned
+fls64(u64 v)
+{
+#ifdef compiler_fls64
+ return compiler_fls64(v);
+#else
+ unsigned bit = 0;
+ while ((v >>= 1) != 0)
+ bit++;
+ return bit;
+#endif
+}
+
+static inline unsigned
+flsw(machine_word_t v)
+{
+ BUILD_BUG_ON(WORDSIZE != 4 && WORDSIZE != 8);
+ if (WORDSIZE == 4)
+ return fls32(v);
+ else
+ return fls64(v);
+}
+
+/* Find First Set bit */
+
+static inline unsigned
+ffs32(u32 v)
+{
+#ifdef compiler_ffs32
+ return compiler_ffs32(v);
+#else
+ unsigned bit;
+ for (bit = 0; !(v & 1); bit++, v >>= 1)
+ ;
+ return bit;
+#endif
+}
+
+static inline unsigned
+ffs64(u64 v)
+{
+#ifdef compiler_ffs64
+ return compiler_ffs64(v);
+#else
+ unsigned bit;
+ for (bit = 0; !(v & 1); bit++, v >>= 1)
+ ;
+ return bit;
+#endif
+}
+
+static inline unsigned
+ffsw(machine_word_t v)
+{
+ BUILD_BUG_ON(WORDSIZE != 4 && WORDSIZE != 8);
+ if (WORDSIZE == 4)
+ return ffs32(v);
+ else
+ return ffs64(v);
+}
+
+#endif /* _WIMLIB_BITOPS_H */
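As a sanity check on the conventions above: the fls* functions return the 0-based index of the highest set bit, the ffs* functions the index of the lowest, and (matching __builtin_clz()/__builtin_ctz()) both families are undefined for an input of 0. A minimal standalone sketch; the test values are illustrative and not part of the patch:

#include <assert.h>
#include "wimlib/bitops.h"

int main(void)
{
	/* 0x90 == 0b10010000: highest set bit is 7, lowest is 4. */
	assert(fls32(0x90) == 7);
	assert(ffs32(0x90) == 4);

	/* A power of two has its single bit reported by both. */
	assert(fls64((u64)1 << 40) == 40);
	assert(ffs64((u64)1 << 40) == 40);

	/* Note: fls*(0) and ffs*(0) are undefined, like __builtin_clz(0). */
	return 0;
}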
--- /dev/null
+++ b/include/wimlib/compiler-gcc.h
+/*
+ * compiler-gcc.h
+ *
+ * The author dedicates this file to the public domain.
+ * You can do whatever you want with this file.
+ */
+
+#ifndef _WIMLIB_COMPILER_GCC_H
+#define _WIMLIB_COMPILER_GCC_H
+
+#ifdef __WIN32__
+# define WIMLIBAPI __declspec(dllexport)
+#else
+# define WIMLIBAPI __attribute__((visibility("default")))
+#endif
+
+#define _packed_attribute __attribute__((packed))
+#define _aligned_attribute(n) __attribute__((aligned(n)))
+#define _may_alias_attribute __attribute__((may_alias))
+#define likely(expr) __builtin_expect(!!(expr), 1)
+#define unlikely(expr) __builtin_expect(!!(expr), 0)
+#define prefetch(addr) __builtin_prefetch(addr)
+#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)
+# define _cold_attribute __attribute__((cold))
+#endif
+#define _malloc_attribute __attribute__((malloc))
+#define inline inline __attribute__((always_inline))
+
+#define CPU_IS_BIG_ENDIAN (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+
+#if defined(__x86_64__) || defined(__i386__)
+# define UNALIGNED_ACCESS_SPEED 3
+#elif defined(__ARM_FEATURE_UNALIGNED) && (__ARM_FEATURE_UNALIGNED == 1)
+# define UNALIGNED_ACCESS_SPEED 2
+#else
+# define UNALIGNED_ACCESS_SPEED 0
+#endif
+
+#define typeof __typeof__
+
+#ifndef min
+# define min(a, b) ({ typeof(a) _a = (a); typeof(b) _b = (b); \
+ (_a < _b) ? _a : _b; })
+#endif
+
+#ifndef max
+# define max(a, b) ({ typeof(a) _a = (a); typeof(b) _b = (b); \
+ (_a > _b) ? _a : _b; })
+#endif
+
+#ifndef swap
+# define swap(a, b) ({ typeof(a) _a = (a); (a) = (b); (b) = _a; })
+#endif
+
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)
+# define compiler_bswap32 __builtin_bswap32
+# define compiler_bswap64 __builtin_bswap64
+#endif
+
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
+# define compiler_bswap16 __builtin_bswap16
+#endif
+
+#define compiler_fls32(n) (31 - __builtin_clz(n))
+#define compiler_fls64(n) (63 - __builtin_clzll(n))
+#define compiler_ffs32(n) __builtin_ctz(n)
+#define compiler_ffs64(n) __builtin_ctzll(n)
+
+#endif /* _WIMLIB_COMPILER_GCC_H */
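A side note on why min(), max(), and swap() are GNU statement expressions rather than plain ternary macros: each argument is evaluated exactly once. A hypothetical, self-contained illustration (GOOD_MIN mirrors the patch's min(); __typeof__ is used directly so it builds even under -std=c99):

#include <stdio.h>

#define BAD_MIN(a, b) ((a) < (b) ? (a) : (b))
#define GOOD_MIN(a, b) ({ __typeof__(a) _a = (a); __typeof__(b) _b = (b); \
			  (_a < _b) ? _a : _b; })

static int calls;
static int next_value(void) { return ++calls; }

int main(void)
{
	calls = 0;
	(void)BAD_MIN(next_value(), 10);   /* argument expanded twice */
	printf("BAD_MIN evaluated its argument %d times\n", calls);  /* 2 */

	calls = 0;
	(void)GOOD_MIN(next_value(), 10);  /* argument evaluated once */
	printf("GOOD_MIN evaluated its argument %d times\n", calls); /* 1 */
	return 0;
}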
#define _WIMLIB_COMPILER_H
#ifdef __GNUC__
-# if defined(__CYGWIN__) || defined(__WIN32__)
-# define WIMLIBAPI __declspec(dllexport)
-# else
-# define WIMLIBAPI __attribute__((visibility("default")))
-# endif
-# define _always_inline_attribute inline __attribute__((always_inline))
-# define _no_inline_attribute __attribute__((noinline))
-# define _packed_attribute __attribute__((packed))
-# define _format_attribute(type, format_str, args_start) \
- /*__attribute__((format(type, format_str, args_start))) */
-# if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)
-# define _cold_attribute __attribute__((cold))
-# else
-# define _cold_attribute
-# endif
-# define _malloc_attribute __attribute__((malloc))
-# define _warn_unused_result_attribute __attribute__((warn_unused_result))
-# define _aligned_attribute(size) __attribute__((aligned(size)))
-# define likely(x) __builtin_expect(!!(x), 1)
-# define unlikely(x) __builtin_expect(!!(x), 0)
-# define inline inline __attribute__((always_inline))
-# define prefetch(x) __builtin_prefetch(x)
-# define is_constant(x) __builtin_constant_p(x)
+# include "wimlib/compiler-gcc.h"
#else
-# define WIMLIBAPI
-# define _always_inline_attribute inline
-# define _no_inline_attribute
-# define _format_attribute(type, format_str, args_start)
-# define _cold_attribute
-# define _packed_attribute
-# define _malloc_attribute
-# define _warn_unused_result_attribute
-# define _aligned_attribute(size)
-# define likely(x) (x)
-# define unlikely(x) (x)
-# define prefetch(x)
-# define is_constant(x) (0)
-#endif /* __GNUC__ */
+# error "Unrecognized compiler. Please add a header file for your compiler."
+#endif
+
+#ifndef WIMLIBAPI
+# define WIMLIBAPI
+#endif
+
+#ifndef _packed_attribute
+# error "missing required definition of _packed_attribute"
+#endif
+
+#ifndef _aligned_attribute
+# error "missing required definition of _aligned_attribute"
+#endif
+
+#ifndef _may_alias_attribute
+# error "missing required definition of _may_alias_attribute"
+#endif
+
+#ifndef likely
+# define likely(expr) (expr)
+#endif
+
+#ifndef unlikely
+# define unlikely(expr) (expr)
+#endif
+
+#ifndef prefetch
+# define prefetch(addr)
+#endif
+
+#ifndef _cold_attribute
+# define _cold_attribute
+#endif
+
+#ifndef _malloc_attribute
+# define _malloc_attribute
+#endif
+
+#ifndef _format_attribute
+# define _format_attribute(type, format_str, format_start)
+#endif
+
+#ifndef CPU_IS_BIG_ENDIAN
+# error "missing required definition of CPU_IS_BIG_ENDIAN"
+#endif
+
+#define CPU_IS_LITTLE_ENDIAN (!CPU_IS_BIG_ENDIAN)
+
+#ifndef UNALIGNED_ACCESS_SPEED
+# define UNALIGNED_ACCESS_SPEED 0
+#endif
+
+#define UNALIGNED_ACCESS_IS_ALLOWED (UNALIGNED_ACCESS_SPEED >= 1)
+#define UNALIGNED_ACCESS_IS_FAST (UNALIGNED_ACCESS_SPEED >= 2)
+#define UNALIGNED_ACCESS_IS_VERY_FAST (UNALIGNED_ACCESS_SPEED >= 3)
+
+#ifndef typeof
+# error "missing required definition of typeof"
+#endif
+
+#if !defined(min) || !defined(max) || !defined(swap)
+# error "missing required definitions of min(), max(), and swap() macros"
+#endif
#ifdef __CHECKER__
# define _bitwise_attr __attribute__((bitwise))
# define _force_attr __attribute__((force))
#else
# define _bitwise_attr
# define _force_attr
#endif
+#ifndef BUILD_BUG_ON
+# define BUILD_BUG_ON(expr) ((void)sizeof(char[1 - 2*!!(expr)]))
+#endif
+
#endif /* _WIMLIB_COMPILER_H */
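The sizeof-based fallback turns a true (failing) condition into a negative array size, so the check costs nothing at run time but requires an integer constant expression. Illustrative use (the conditions shown are examples, not from the patch):

#include "wimlib/compiler.h"
#include "wimlib/types.h"

void check_assumptions(void)
{
	/* Condition false: char[1 - 2*0] == char[1], compiles fine. */
	BUILD_BUG_ON(sizeof(u32) != 4);

	/* Condition true: char[1 - 2*1] == char[-1], a compile error.
	 * BUILD_BUG_ON(sizeof(u32) != 8);
	 */
}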
#include "wimlib/compiler.h"
#include "wimlib/endianness.h"
#include "wimlib/types.h"
+#include "wimlib/unaligned.h"
/* Structure that encapsulates a block of in-memory data being interpreted as a
* stream of bits, optionally with interwoven literal bytes. Bits are assumed
if (unlikely(is->end - is->next < 2))
goto overflow;
- is->bitbuf |= (u32)le16_to_cpu(*(const le16 *)is->next)
- << (16 - is->bitsleft);
+ is->bitbuf |= (u32)get_unaligned_u16_le(is->next) << (16 - is->bitsleft);
is->next += 2;
is->bitsleft += 16;
if (unlikely(is->end - is->next < 2))
goto overflow;
- is->bitbuf |= (u32)le16_to_cpu(*(const le16 *)is->next);
+ is->bitbuf |= (u32)get_unaligned_u16_le(is->next);
is->next += 2;
is->bitsleft = 32;
}
if (unlikely(is->end - is->next < 2))
return 0;
- v = le16_to_cpu(*(const le16 *)is->next);
+ v = get_unaligned_u16_le(is->next);
is->next += 2;
return v;
}
if (unlikely(is->end - is->next < 4))
return 0;
- v = le32_to_cpu(*(const le32 *)is->next);
+ v = get_unaligned_u32_le(is->next);
is->next += 4;
return v;
}
/*
- * Copy a LZ77 match at (dst - offset) to dst.
+ * Copy an LZ77 match at (dst - offset) to dst.
*
* The length and offset must be already validated --- that is, (dst - offset)
* can't underrun the output buffer, and (dst + length) can't overrun the output
* This function won't write any data beyond this position.
*/
static inline void
-lz_copy(u8 *dst, u32 length, u32 offset, const u8 *winend)
+lz_copy(u8 *dst, u32 length, u32 offset, const u8 *winend, u32 min_length)
{
const u8 *src = dst - offset;
-#if defined(__x86_64__) || defined(__i386__)
- /* Copy one 'unsigned long' at a time. On i386 and x86_64 this is
+ const u8 * const end = dst + length;
+
+ /*
+ * Try to copy one machine word at a time. On i386 and x86_64 this is
* faster than copying one byte at a time, unless the data is
* near-random and all the matches have very short lengths. Note that
* since this requires unaligned memory accesses, it won't necessarily
* be faster on every architecture.
*
* Also note that we might copy more than the length of the match. For
- * example, if an 'unsigned long' is 8 bytes and the match is of length
- * 5, then we'll simply copy 8 bytes. This is okay as long as we don't
- * write beyond the end of the output buffer, hence the check for
- * (winend - (dst + length) >= sizeof(unsigned long) - 1). */
- if (offset >= sizeof(unsigned long) &&
- winend - (dst + length) >= sizeof(unsigned long) - 1)
+ * example, if a word is 8 bytes and the match is of length 5, then
+ * we'll simply copy 8 bytes. This is okay as long as we don't write
+ * beyond the end of the output buffer, hence the check for (winend -
+ * end >= WORDSIZE - 1).
+ */
+ if (UNALIGNED_ACCESS_IS_VERY_FAST &&
+ likely(winend - end >= WORDSIZE - 1))
{
- /* Access memory through a packed struct. This tricks the
- * compiler into allowing unaligned memory accesses. */
- struct ulong_wrapper {
- unsigned long v;
- } _packed_attribute;
-
- const u8 * const end = dst + length;
- unsigned long v;
-
- v = ((struct ulong_wrapper *)src)->v;
- ((struct ulong_wrapper *)dst)->v = v;
- dst += sizeof(unsigned long);
- src += sizeof(unsigned long);
- if (dst < end) {
+ if (offset >= WORDSIZE) {
+ /* The source and destination words don't overlap. */
+
+ /* To improve branch prediction, one iteration of this
+ * loop is unrolled. Most matches are short and will
+ * fail the first check. But if that check passes, then
+			 * it becomes increasingly likely that the match is long
+ * and we'll need to continue copying. */
+
+ copy_word_unaligned(src, dst);
+ src += WORDSIZE;
+ dst += WORDSIZE;
+
+ if (dst < end) {
+ do {
+ copy_word_unaligned(src, dst);
+ src += WORDSIZE;
+ dst += WORDSIZE;
+ } while (dst < end);
+ }
+ return;
+ } else if (offset == 1) {
+
+ /* Offset 1 matches are equivalent to run-length
+ * encoding of the previous byte. This case is common
+ * if the data contains many repeated bytes. */
+
+ machine_word_t v = repeat_byte(*(dst - 1));
do {
- v = ((struct ulong_wrapper *)src)->v;
- ((struct ulong_wrapper *)dst)->v = v;
- dst += sizeof(unsigned long);
- src += sizeof(unsigned long);
+ store_word_unaligned(v, dst);
+ src += WORDSIZE;
+ dst += WORDSIZE;
} while (dst < end);
+ return;
}
+ /*
+ * We don't bother with special cases for other 'offset <
+ * WORDSIZE', which are usually rarer than 'offset == 1'. Extra
+ * checks will just slow things down. Actually, it's possible
+ * to handle all the 'offset < WORDSIZE' cases using the same
+		 * code, but it becomes more complicated and doesn't seem any
+ * faster overall; it definitely slows down the more common
+ * 'offset == 1' case.
+ */
+ }
- return;
+ /* Fall back to a bytewise copy. */
+
+ if (min_length >= 2) {
+ *dst++ = *src++;
+ length--;
+ }
+ if (min_length >= 3) {
+ *dst++ = *src++;
+ length--;
+ }
+ if (min_length >= 4) {
+ *dst++ = *src++;
+ length--;
}
-#endif
do {
*dst++ = *src++;
} while (--length);
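To make the new contract concrete: 'min_length' is a compile-time lower bound the caller guarantees for 'length', which lets the bytewise fallback copy the first few bytes unconditionally; the match semantics are unchanged, so an offset-1 match still expands as run-length encoding. A hypothetical driver (buffer contents are illustrative; assumes wimlib's usual -I include paths):

#include <assert.h>
#include <string.h>
#include "wimlib/decompress_common.h"

int main(void)
{
	u8 buf[32] = "abc";

	/* Decode a match of length 6 at offset 1 starting at buf[3].
	 * Since src overlaps dst, this repeats 'c' six times; memcpy()
	 * or memmove() would give the wrong result here. */
	lz_copy(&buf[3], 6, 1, buf + sizeof(buf), 2);
	assert(memcmp(buf, "abccccccc", 9) == 0);
	return 0;
}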
+/*
+ * endianness.h
+ *
+ * Macros and inline functions for endianness conversion.
+ *
+ * The author dedicates this file to the public domain.
+ * You can do whatever you want with this file.
+ */
+
#ifndef _WIMLIB_ENDIANNESS_H
#define _WIMLIB_ENDIANNESS_H
+#include "wimlib/compiler.h"
#include "wimlib/types.h"
-/* Watch out for conflict with ntfs-3g/endian.h ... */
-
+/* Watch out for conflict with ntfs-3g/endians.h ... */
#ifndef _NTFS_ENDIANS_H
-static inline u16
-bswap16(u16 n)
+static inline u16 bswap16(u16 n)
{
+#ifdef compiler_bswap16
+ return compiler_bswap16(n);
+#else
return (n << 8) | (n >> 8);
+#endif
}
-static inline u32
-bswap32(u32 n)
+static inline u32 bswap32(u32 n)
{
-#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)
- return __builtin_bswap32(n);
+#ifdef compiler_bswap32
+ return compiler_bswap32(n);
#else
- return (n << 24) | ((n & 0xff00) << 8) | ((n & 0xff0000) >> 8) |
- (n >> 24);
+ return (n << 24) |
+ ((n & 0xFF00) << 8) |
+ ((n & 0xFF0000) >> 8) |
+ (n >> 24);
#endif
}
-static inline u64
-bswap64(u64 n)
+static inline u64 bswap64(u64 n)
{
-#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)
- return __builtin_bswap64(n);
+#ifdef compiler_bswap64
+ return compiler_bswap64(n);
#else
- return (n << 56) | ((n & 0xff00) << 40) | ((n & 0xff0000) << 24) |
- ((n & 0xff000000) << 8) | ((n & 0xff00000000) >> 8) |
- ((n & 0xff0000000000) >> 24) |
- ((n & 0xff000000000000) >> 40) | (n >> 56);
+ return (n << 56) |
+ ((n & 0xFF00) << 40) |
+ ((n & 0xFF0000) << 24) |
+ ((n & 0xFF000000) << 8) |
+ ((n & 0xFF00000000) >> 8) |
+ ((n & 0xFF0000000000) >> 24) |
+ ((n & 0xFF000000000000) >> 40) |
+ (n >> 56);
#endif
}
-# ifdef WORDS_BIGENDIAN
-# define cpu_to_le16(n) ((_force_attr le16)bswap16(n))
-# define cpu_to_le32(n) ((_force_attr le32)bswap32(n))
-# define cpu_to_le64(n) ((_force_attr le64)bswap64(n))
-# define le16_to_cpu(n) bswap16((_force_attr u16)(le16)(n))
-# define le32_to_cpu(n) bswap32((_force_attr u32)(le32)(n))
-# define le64_to_cpu(n) bswap64((_force_attr u64)(le64)(n))
-
-# define cpu_to_be16(n) ((_force_attr be16)(u16)(n))
-# define cpu_to_be32(n) ((_force_attr be32)(u32)(n))
-# define cpu_to_be64(n) ((_force_attr be64)(u64)(n))
-# define be16_to_cpu(n) ((_force_attr u16)(be16)(n))
-# define be32_to_cpu(n) ((_force_attr u32)(be32)(n))
-# define be64_to_cpu(n) ((_force_attr u64)(be64)(n))
-# else
-# define cpu_to_le16(n) ((_force_attr le16)(u16)(n))
-# define cpu_to_le32(n) ((_force_attr le32)(u32)(n))
-# define cpu_to_le64(n) ((_force_attr le64)(u64)(n))
-# define le16_to_cpu(n) ((_force_attr u16)(le16)(n))
-# define le32_to_cpu(n) ((_force_attr u32)(le32)(n))
-# define le64_to_cpu(n) ((_force_attr u64)(le64)(n))
+#if CPU_IS_BIG_ENDIAN
+# define cpu_to_le16(n) ((_force_attr le16)bswap16(n))
+# define cpu_to_le32(n) ((_force_attr le32)bswap32(n))
+# define cpu_to_le64(n) ((_force_attr le64)bswap64(n))
+# define le16_to_cpu(n) bswap16((_force_attr u16)(le16)(n))
+# define le32_to_cpu(n) bswap32((_force_attr u32)(le32)(n))
+# define le64_to_cpu(n) bswap64((_force_attr u64)(le64)(n))
+# define cpu_to_be16(n) ((_force_attr be16)(u16)(n))
+# define cpu_to_be32(n) ((_force_attr be32)(u32)(n))
+# define cpu_to_be64(n) ((_force_attr be64)(u64)(n))
+# define be16_to_cpu(n) ((_force_attr u16)(be16)(n))
+# define be32_to_cpu(n) ((_force_attr u32)(be32)(n))
+# define be64_to_cpu(n) ((_force_attr u64)(be64)(n))
+#else
+# define cpu_to_le16(n) ((_force_attr le16)(u16)(n))
+# define cpu_to_le32(n) ((_force_attr le32)(u32)(n))
+# define cpu_to_le64(n) ((_force_attr le64)(u64)(n))
+# define le16_to_cpu(n) ((_force_attr u16)(le16)(n))
+# define le32_to_cpu(n) ((_force_attr u32)(le32)(n))
+# define le64_to_cpu(n) ((_force_attr u64)(le64)(n))
+# define cpu_to_be16(n) ((_force_attr be16)bswap16(n))
+# define cpu_to_be32(n) ((_force_attr be32)bswap32(n))
+# define cpu_to_be64(n) ((_force_attr be64)bswap64(n))
+# define be16_to_cpu(n) bswap16((_force_attr u16)(be16)(n))
+# define be32_to_cpu(n) bswap32((_force_attr u32)(be32)(n))
+# define be64_to_cpu(n) bswap64((_force_attr u64)(be64)(n))
+#endif
-# define cpu_to_be16(n) ((_force_attr be16)bswap16(n))
-# define cpu_to_be32(n) ((_force_attr be32)bswap32(n))
-# define cpu_to_be64(n) ((_force_attr be64)bswap64(n))
-# define be16_to_cpu(n) bswap16((_force_attr u16)(be16)(n))
-# define be32_to_cpu(n) bswap32((_force_attr u32)(be32)(n))
-# define be64_to_cpu(n) bswap64((_force_attr u64)(be64)(n))
-# endif
#endif /* _NTFS_ENDIANS_H */
-
#endif /* _WIMLIB_ENDIANNESS_H */
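Behavioral summary of the macros above: bswap* always reverse byte order, while the cpu_to_*/ *_to_cpu pairs are the identity on matching-endian hosts and a byte swap otherwise, so round trips always return the original value. Illustrative checks (not part of the patch):

#include <assert.h>
#include "wimlib/endianness.h"

int main(void)
{
	u32 v = 0x11223344;

	/* Byte swaps are unconditional. */
	assert(bswap16((u16)0xABCD) == 0xCDAB);
	assert(bswap32(v) == 0x44332211);

	/* Conversions round-trip on any host. */
	assert(le32_to_cpu(cpu_to_le32(v)) == v);
	assert(be64_to_cpu(cpu_to_be64((u64)v)) == (u64)v);
	return 0;
}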
#ifndef _WIMLIB_LZ_EXTEND_H
#define _WIMLIB_LZ_EXTEND_H
-#include "wimlib/types.h"
-
-#if (defined(__x86_64__) || defined(__i386__)) && defined(__GNUC__)
-# define HAVE_FAST_LZ_EXTEND 1
-#else
-# define HAVE_FAST_LZ_EXTEND 0
-#endif
+#include "wimlib/bitops.h"
+#include "wimlib/unaligned.h"
/* Return the number of bytes at @matchptr that match the bytes at @strptr, up
* to a maximum of @max_len. Initially, @start_len bytes are matched. */
{
u32 len = start_len;
-#if HAVE_FAST_LZ_EXTEND
-
- while (len + sizeof(unsigned long) <= max_len) {
- unsigned long x;
+ if (UNALIGNED_ACCESS_IS_FAST) {
- x = *(const unsigned long *)&matchptr[len] ^
- *(const unsigned long *)&strptr[len];
- if (x != 0)
- return len + (__builtin_ctzl(x) >> 3);
- len += sizeof(unsigned long);
- }
+ machine_word_t v_word;
- if (sizeof(unsigned int) < sizeof(unsigned long) &&
- len + sizeof(unsigned int) <= max_len)
- {
- unsigned int x;
+ if (likely(max_len - len >= 4 * WORDSIZE)) {
- x = *(const unsigned int *)&matchptr[len] ^
- *(const unsigned int *)&strptr[len];
- if (x != 0)
- return len + (__builtin_ctz(x) >> 3);
- len += sizeof(unsigned int);
- }
+ #define COMPARE_WORD_STEP \
+ v_word = load_word_unaligned(&matchptr[len]) ^ \
+ load_word_unaligned(&strptr[len]); \
+ if (v_word != 0) \
+ goto word_differs; \
+ len += WORDSIZE; \
- if (sizeof(unsigned int) == 4) {
- if (len < max_len && matchptr[len] == strptr[len]) {
- len++;
- if (len < max_len && matchptr[len] == strptr[len]) {
- len++;
- if (len < max_len && matchptr[len] == strptr[len]) {
- len++;
- }
- }
+ COMPARE_WORD_STEP
+ COMPARE_WORD_STEP
+ COMPARE_WORD_STEP
+ COMPARE_WORD_STEP
+ #undef COMPARE_WORD_STEP
}
- return len;
- }
-#endif /* HAVE_FAST_LZ_EXTEND */
+ while (len + WORDSIZE <= max_len) {
+ v_word = load_word_unaligned(&matchptr[len]) ^
+ load_word_unaligned(&strptr[len]);
+ if (v_word != 0)
+ goto word_differs;
+ len += WORDSIZE;
+ }
- while (len < max_len && matchptr[len] == strptr[len])
- len++;
+ while (len < max_len && matchptr[len] == strptr[len])
+ len++;
+ return len;
- return len;
+	word_differs:
+		/* The index of the first differing byte within the word
+		 * depends on the byte order: only on little-endian does the
+		 * byte at the lowest address hold the low-order bits. */
+		if (CPU_IS_LITTLE_ENDIAN)
+			return len + (ffsw(v_word) >> 3);
+		else
+			return len + (WORDSIZE - 1) - (flsw(v_word) >> 3);
+ } else {
+ while (len < max_len && matchptr[len] == strptr[len])
+ len++;
+ return len;
+ }
}
#endif /* _WIMLIB_LZ_EXTEND_H */
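Why 'ffsw(v_word) >> 3' is the right answer on little-endian: XOR leaves a zero byte wherever the two strings agree, so the first differing byte is the lowest-addressed nonzero byte, which on little-endian holds the lowest-order set bits. A worked example with illustrative bytes:

/*
 * strptr[len..len+3]   = 61 62 63 64   ("abcd")
 * matchptr[len..len+3] = 61 62 63 65   ("abce")
 *
 * Little-endian 32-bit loads give
 *     0x64636261 ^ 0x65636261 = 0x01000000
 * and ffs32(0x01000000) >> 3 == 24 >> 3 == 3: three more bytes match.
 *
 * Big-endian loads of the same bytes give
 *     0x61626364 ^ 0x61626365 = 0x00000001,
 * where the first differing byte now holds the *highest* set bit;
 * hence the fls-based expression on that path.
 */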
#ifndef _WIMLIB_LZ_HASH3_H
#define _WIMLIB_LZ_HASH3_H
-#include "wimlib/types.h"
+#include "wimlib/unaligned.h"
/* Constant for the multiplicative hash function. */
#define LZ_HASH_MULTIPLIER 0x1E35A7BD
-/* Hash the next 3-byte sequence in the window, producing a hash of length
- * 'num_bits' bits. 4 bytes must be available, since 32-bit unaligned load is
- * faster on some architectures. */
static inline u32
-lz_hash(const u8 *p, unsigned int num_bits)
+loaded_u32_to_u24(u32 v)
{
- u32 str;
- u32 hash;
+ if (CPU_IS_LITTLE_ENDIAN)
+ return v & 0xFFFFFF;
+ else
+ return v >> 8;
+}
-#if defined(__i386__) || defined(__x86_64__)
- /* Unaligned access allowed, and little endian CPU.
- * Callers ensure that at least 4 (not 3) bytes are remaining. */
- str = *(const u32 *)p & 0xFFFFFF;
-#else
- str = ((u32)p[0] << 0) | ((u32)p[1] << 8) | ((u32)p[2] << 16);
-#endif
+static inline u32
+load_u24_unaligned(const u8 *p)
+{
+ if (UNALIGNED_ACCESS_IS_FAST)
+ return loaded_u32_to_u24(load_u32_unaligned(p));
+ else
+ return ((u32)p[0] << 0) | ((u32)p[1] << 8) | ((u32)p[2] << 16);
+}
- hash = str * LZ_HASH_MULTIPLIER;
+static inline u32
+lz_hash_u24(u32 str, unsigned num_bits)
+{
+ return (u32)(str * LZ_HASH_MULTIPLIER) >> (32 - num_bits);
+}
- /* High bits are more random than the low bits. */
- return hash >> (32 - num_bits);
+/*
+ * Hash the next 3-byte sequence in the window, producing a hash of length
+ * 'num_bits' bits. At least LZ_HASH_REQUIRED_NBYTES must be available at 'p';
+ * this might be 4 bytes rather than 3 because an unaligned load is faster on
+ * some architectures.
+ */
+static inline u32
+lz_hash(const u8 *p, unsigned num_bits)
+{
+ return lz_hash_u24(load_u24_unaligned(p), num_bits);
}
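The hash is the usual multiplicative scheme: the 24-bit sequence is multiplied by a constant and the top 'num_bits' bits of the 32-bit product, the best-mixed ones, become the table index. A small illustrative check (hypothetical window contents):

#include <assert.h>
#include "wimlib/lz_hash3.h"

int main(void)
{
	static const u8 window[] = "abcabcxyz";

	/* Identical 3-byte sequences always land in the same slot... */
	assert(lz_hash(&window[0], 15) == lz_hash(&window[3], 15));

	/* ...and every hash fits in the requested table size. */
	assert(lz_hash(&window[6], 15) < (1U << 15));
	return 0;
}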
/* The number of bytes being hashed. */
#define _LZ_REPSEARCH_H
#include "wimlib/lz_extend.h"
+#include "wimlib/unaligned.h"
extern u32
lz_extend_repmatch(const u8 *strptr, const u8 *matchptr, u32 max_len);
unsigned rep_max_idx;
u32 rep_len;
u32 rep_max_len;
- const u16 str = *(const u16 *)strptr;
+ const u16 str = load_u16_unaligned(strptr);
const u8 *matchptr;
matchptr = strptr - recent_offsets[0];
- if (*(const u16 *)matchptr == str)
+ if (load_u16_unaligned(matchptr) == str)
rep_max_len = lz_extend_repmatch(strptr, matchptr, max_len);
else
rep_max_len = 0;
rep_max_idx = 0;
matchptr = strptr - recent_offsets[1];
- if (*(const u16 *)matchptr == str) {
+ if (load_u16_unaligned(matchptr) == str) {
rep_len = lz_extend_repmatch(strptr, matchptr, max_len);
if (rep_len > rep_max_len) {
rep_max_len = rep_len;
}
matchptr = strptr - recent_offsets[2];
- if (*(const u16 *)matchptr == str) {
+ if (load_u16_unaligned(matchptr) == str) {
rep_len = lz_extend_repmatch(strptr, matchptr, max_len);
if (rep_len > rep_max_len) {
rep_max_len = rep_len;
#define _WIMLIB_LZX_H
#include "wimlib/assert.h"
+#include "wimlib/bitops.h"
#include "wimlib/compiler.h"
#include "wimlib/lzx_constants.h"
#include "wimlib/util.h"
return (adjusted_offset >> 17) + 34;
} else {
LZX_ASSERT(2 <= adjusted_offset && adjusted_offset < 655360);
- unsigned mssb_idx = bsr32(adjusted_offset);
+ unsigned mssb_idx = fls32(adjusted_offset);
return (mssb_idx << 1) |
((adjusted_offset >> (mssb_idx - 1)) & 1);
}
# define WIMLIB_WIMSTRUCT_DECLARED
#endif
-#endif
+/*
+ * Type of a machine word. 'unsigned long' would be logical, but that is only
+ * 32 bits on x86_64 Windows. The same applies to 'uint_fast32_t'. So the best
+ * we can do without a bunch of #ifdefs appears to be 'size_t'.
+ */
+typedef size_t machine_word_t;
+
+#define WORDSIZE sizeof(machine_word_t)
+
+#endif /* _WIMLIB_TYPES_H */
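The LLP64 detail is worth spelling out: on x86_64 Windows, 'long' stays 32 bits while pointers and size_t are 64 bits. Compile-time checks that hold on that platform (and that would deliberately fail on LP64 Linux, where 'unsigned long' is 8 bytes):

#include "wimlib/compiler.h"
#include "wimlib/types.h"

void check_llp64(void)
{
	/* x86_64 Windows only: 'unsigned long' would halve the word size. */
	BUILD_BUG_ON(sizeof(unsigned long) != 4);
	BUILD_BUG_ON(sizeof(size_t) != 8);
	BUILD_BUG_ON(WORDSIZE != 8);	/* machine_word_t keeps full width */
}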
#ifndef _WIMLIB_UNALIGNED_H
#define _WIMLIB_UNALIGNED_H
-#include "compiler.h"
-#include "endianness.h"
-#include "types.h"
+#include "wimlib/compiler.h"
+#include "wimlib/endianness.h"
+#include "wimlib/types.h"
#define DEFINE_UNALIGNED_TYPE(type) \
struct type##_unaligned { \
((struct type##_unaligned *)p)->v = val; \
}
+DEFINE_UNALIGNED_TYPE(u16);
+DEFINE_UNALIGNED_TYPE(u32);
+DEFINE_UNALIGNED_TYPE(u64);
DEFINE_UNALIGNED_TYPE(le16);
DEFINE_UNALIGNED_TYPE(le32);
DEFINE_UNALIGNED_TYPE(le64);
+DEFINE_UNALIGNED_TYPE(be16);
DEFINE_UNALIGNED_TYPE(be32);
+DEFINE_UNALIGNED_TYPE(be64);
DEFINE_UNALIGNED_TYPE(size_t);
+DEFINE_UNALIGNED_TYPE(machine_word_t);
+
+#define load_word_unaligned load_machine_word_t_unaligned
+#define store_word_unaligned store_machine_word_t_unaligned
+
+static inline void
+copy_word_unaligned(const void *src, void *dst)
+{
+ store_word_unaligned(load_word_unaligned(src), dst);
+}
+
+static inline machine_word_t
+repeat_byte(u8 b)
+{
+ machine_word_t v;
+
+ BUILD_BUG_ON(WORDSIZE != 4 && WORDSIZE != 8);
+
+ v = b;
+ v |= v << 8;
+ v |= v << 16;
+ v |= v << ((WORDSIZE == 8) ? 32 : 0);
+ return v;
+}
+
+static inline u16
+get_unaligned_u16_le(const void *p)
+{
+ u16 v;
+
+ if (UNALIGNED_ACCESS_IS_FAST) {
+ v = le16_to_cpu(load_le16_unaligned(p));
+ } else {
+ const u8 *p8 = p;
+ v = 0;
+ v |= (u16)p8[0] << 0;
+ v |= (u16)p8[1] << 8;
+ }
+ return v;
+}
+
+static inline u32
+get_unaligned_u32_le(const void *p)
+{
+ u32 v;
+
+ if (UNALIGNED_ACCESS_IS_FAST) {
+ v = le32_to_cpu(load_le32_unaligned(p));
+ } else {
+ const u8 *p8 = p;
+ v = 0;
+ v |= (u32)p8[0] << 0;
+ v |= (u32)p8[1] << 8;
+ v |= (u32)p8[2] << 16;
+ v |= (u32)p8[3] << 24;
+ }
+ return v;
+}
+
+static inline void
+put_unaligned_u16_le(u16 v, void *p)
+{
+ if (UNALIGNED_ACCESS_IS_FAST) {
+ store_le16_unaligned(cpu_to_le16(v), p);
+ } else {
+ u8 *p8 = p;
+ p8[0] = (v >> 0) & 0xFF;
+ p8[1] = (v >> 8) & 0xFF;
+ }
+}
+
+static inline void
+put_unaligned_u32_le(u32 v, void *p)
+{
+ if (UNALIGNED_ACCESS_IS_FAST) {
+ store_le32_unaligned(cpu_to_le32(v), p);
+ } else {
+ u8 *p8 = p;
+ p8[0] = (v >> 0) & 0xFF;
+ p8[1] = (v >> 8) & 0xFF;
+ p8[2] = (v >> 16) & 0xFF;
+ p8[3] = (v >> 24) & 0xFF;
+ }
+}
#endif /* _WIMLIB_UNALIGNED_H */
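The get/put helpers produce the same bytes in memory on either kind of CPU and at any alignment; only the code path differs. A round-trip sketch (values are illustrative):

#include <assert.h>
#include "wimlib/unaligned.h"

int main(void)
{
	u8 buf[16] = {0};

	/* Store at an odd (misaligned) offset, then read it back. */
	put_unaligned_u32_le(0xDEADBEEF, &buf[1]);
	assert(get_unaligned_u32_le(&buf[1]) == 0xDEADBEEF);

	/* The byte layout is little-endian regardless of the host. */
	assert(buf[1] == 0xEF && buf[4] == 0xDE);

	/* repeat_byte() broadcasts a byte across a machine word.  (The
	 * cast truncates the constant, so this also holds on 32-bit.) */
	assert(repeat_byte(0x5A) == (machine_word_t)0x5A5A5A5A5A5A5A5AULL);
	return 0;
}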
#include <stdio.h>
-#ifndef min
-#define min(a, b) ({ typeof(a) __a = (a); typeof(b) __b = (b); \
- (__a < __b) ? __a : __b; })
-#endif
-
-#ifndef max
-#define max(a, b) ({ typeof(a) __a = (a); typeof(b) __b = (b); \
- (__a > __b) ? __a : __b; })
-#endif
-
-#ifndef swap
-#define swap(a, b) ({typeof(a) _a = a; (a) = (b); (b) = _a;})
-#endif
-
/**
* container_of - cast a member of a structure out to the containing structure
* @ptr: the pointer to the member.
/* Used for buffering FILE IO in a few places */
#define BUFFER_SIZE 32768
-#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
-
/* Maximum number of array elements to allocate on the stack (used in various
* places when large temporary buffers are needed). */
#define STACK_MAX 32768
wimlib_free_memory(void *p);
extern void *
-wimlib_realloc(void *, size_t) _warn_unused_result_attribute;
+wimlib_realloc(void *, size_t);
extern void *
wimlib_calloc(size_t nmemb, size_t size) _malloc_attribute;
extern void
print_byte_field(const u8 field[], size_t len, FILE *out);
-static inline u32
-bsr32(u32 n)
-{
-#if defined(__x86__) || defined(__x86_64__)
- asm("bsrl %0, %0;"
- : "=r"(n)
- : "0" (n));
- return n;
-#else
- u32 pow = 0;
- while ((n >>= 1) != 0)
- pow++;
- return pow;
-#endif
-}
-
static inline bool
is_power_of_2(unsigned long n)
{
#endif
#include "wimlib/decompress_common.h"
-#include "wimlib/util.h" /* for BUILD_BUG_ON() */
#include <string.h>
+#define USE_WORD_FILL
+
#ifdef __GNUC__
# ifdef __SSE2__
+# undef USE_WORD_FILL
# define USE_SSE2_FILL
# include <emmintrin.h>
-# else
-# define USE_LONG_FILL
# endif
#endif
unsigned stores_per_loop;
unsigned decode_table_pos;
-#ifdef USE_LONG_FILL
- const unsigned entries_per_long = sizeof(unsigned long) / sizeof(decode_table[0]);
+#ifdef USE_WORD_FILL
+ const unsigned entries_per_word = WORDSIZE / sizeof(decode_table[0]);
#endif
#ifdef USE_SSE2_FILL
* have the most entries. From there, the number of entries per
* codeword will decrease. As an optimization, we may begin filling
* entries with SSE2 vector accesses (8 entries/store), then change to
- * 'unsigned long' accesses (2 or 4 entries/store), then change to
+ * 'machine_word_t' accesses (2 or 4 entries/store), then change to
* 16-bit accesses (1 entry/store). */
decode_table_ptr = decode_table;
sym_idx = 0;
for (; stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1) {
unsigned end_sym_idx = sym_idx + len_counts[codeword_len];
for (; sym_idx < end_sym_idx; sym_idx++) {
- /* Note: unlike in the 'long' version below, the __m128i
- * type already has __attribute__((may_alias)), so using
- * it to access the decode table, which is an array of
- * unsigned shorts, will not violate strict aliasing.
- */
+ /* Note: unlike in the machine_word_t version below, the
+ * __m128i type already has __attribute__((may_alias)),
+ * so using it to access the decode table, which is an
+ * array of unsigned shorts, will not violate strict
+ * aliasing. */
u16 entry;
__m128i v;
__m128i *p;
}
#endif /* USE_SSE2_FILL */
-#ifdef USE_LONG_FILL
- /* Fill the entries one 'unsigned long' at a time.
+#ifdef USE_WORD_FILL
+ /* Fill the entries one machine word at a time.
* On 32-bit systems this is 2 entries per store, while on 64-bit
* systems this is 4 entries per store. */
- stores_per_loop = (1 << (table_bits - codeword_len)) / entries_per_long;
+ stores_per_loop = (1 << (table_bits - codeword_len)) / entries_per_word;
for (; stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1) {
unsigned end_sym_idx = sym_idx + len_counts[codeword_len];
for (; sym_idx < end_sym_idx; sym_idx++) {
- /* Accessing the array of unsigned shorts as unsigned
- * longs would violate strict aliasing and would require
- * compiling the code with -fno-strict-aliasing to
- * guarantee correctness. To work around this problem,
- * use the gcc 'may_alias' extension to define a special
- * unsigned long type that may alias any other in-memory
- * variable. */
- typedef unsigned long __attribute__((may_alias)) aliased_long_t;
-
- unsigned long v;
- aliased_long_t *p;
+ /* Accessing the array of u16 as u32 or u64 would
+ * violate strict aliasing and would require compiling
+ * the code with -fno-strict-aliasing to guarantee
+ * correctness. To work around this problem, use the
+ * gcc 'may_alias' extension. */
+ typedef machine_word_t _may_alias_attribute aliased_word_t;
+
+ machine_word_t v;
+ aliased_word_t *p;
unsigned n;
- BUILD_BUG_ON(sizeof(unsigned long) != 4 &&
- sizeof(unsigned long) != 8);
+ BUILD_BUG_ON(WORDSIZE != 4 && WORDSIZE != 8);
v = MAKE_DIRECT_ENTRY(sorted_syms[sym_idx], codeword_len);
v |= v << 16;
- v |= v << (sizeof(unsigned long) == 8 ? 32 : 0);
+ v |= v << (WORDSIZE == 8 ? 32 : 0);
- p = (aliased_long_t *)decode_table_ptr;
+ p = (aliased_word_t *)decode_table_ptr;
n = stores_per_loop;
do {
decode_table_ptr = p;
}
}
-#endif /* USE_LONG_FILL */
+#endif /* USE_WORD_FILL */
/* Fill the entries one 16-bit integer at a time. */
stores_per_loop = (1 << (table_bits - codeword_len));
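For reference, the replication that feeds those word-sized stores works like repeat_byte() but at 16-bit granularity. Sketch of the arithmetic (the entry value is illustrative):

/*
 * Suppose MAKE_DIRECT_ENTRY() produced the 16-bit entry 0x1234 and
 * WORDSIZE == 8.  Then:
 *
 *     v  = 0x1234
 *     v |= v << 16;   ->  0x12341234
 *     v |= v << 32;   ->  0x1234123412341234
 *
 * and each 64-bit store fills four adjacent decode_table[] slots in
 * one instruction.  On a 32-bit build the last shift is skipped and
 * each store fills two slots.
 */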
struct lz_match *lz_matchptr = matches;
u32 hash;
u32 cur_match;
+ u32 sequence;
if (unlikely(bytes_remaining < LZ_HASH_REQUIRED_NBYTES + 1))
return 0;
if (unlikely(best_len >= max_len))
return 0;
+ if (UNALIGNED_ACCESS_IS_FAST)
+ sequence = load_u24_unaligned(strptr);
+
/* Search the appropriate hash chain for matches. */
for (; cur_match && depth_remaining--; cur_match = prev_tab[cur_match]) {
if (matchptr[best_len] != strptr[best_len])
goto next_match;
- #if HAVE_FAST_LZ_EXTEND
- if ((*(const u32 *)strptr & 0xFFFFFF) !=
- (*(const u32 *)matchptr & 0xFFFFFF))
- goto next_match;
+ if (UNALIGNED_ACCESS_IS_FAST) {
+ if (load_u24_unaligned(matchptr) != sequence)
+ goto next_match;
+
+ len = lz_extend(strptr, matchptr, 3, max_len);
+
+ if (len > best_len) {
+ best_len = len;
+
+ *lz_matchptr++ = (struct lz_match) {
+ .len = best_len,
+ .offset = strptr - matchptr,
+ };
+
+ if (best_len >= nice_len)
+ break;
+ }
+ } else {
+
+ /* The bytes at indices 'best_len - 1' and '0' are less
+ * important to check separately. But doing so still
+ * gives a slight performance improvement, at least on
+ * x86_64, probably because they create separate
+ * branches for the CPU to predict independently of the
+ * branches in the main comparison loops.
+ */
+ if (matchptr[best_len - 1] != strptr[best_len - 1] ||
+ matchptr[0] != strptr[0])
+ goto next_match;
+
+ for (len = 1; len < best_len - 1; len++)
+ if (matchptr[len] != strptr[len])
+ goto next_match;
- len = lz_extend(strptr, matchptr, 3, max_len);
+ /* The match is the longest found so far --- at least
+ * 'best_len' + 1 bytes. Continue extending it. */
- if (len > best_len) {
- best_len = len;
+ if (++best_len != max_len &&
+ strptr[best_len] == matchptr[best_len])
+ while (++best_len != max_len)
+ if (strptr[best_len] != matchptr[best_len])
+ break;
+ /* Record the match. */
*lz_matchptr++ = (struct lz_match) {
.len = best_len,
.offset = strptr - matchptr,
};
+ /* Terminate the search if 'nice_len' was reached. */
if (best_len >= nice_len)
break;
}
- #else /* HAVE_FAST_LZ_EXTEND */
-
- /* The bytes at indices 'best_len - 1' and '0' are less
- * important to check separately. But doing so still gives a
- * slight performance improvement, at least on x86_64, probably
- * because they create separate branches for the CPU to predict
- * independently of the branches in the main comparison loops.
- */
- if (matchptr[best_len - 1] != strptr[best_len - 1] ||
- matchptr[0] != strptr[0])
- goto next_match;
-
- for (len = 1; len < best_len - 1; len++)
- if (matchptr[len] != strptr[len])
- goto next_match;
-
- /* The match is the longest found so far ---
- * at least 'best_len' + 1 bytes. Continue extending it. */
-
- if (++best_len != max_len && strptr[best_len] == matchptr[best_len])
- while (++best_len != max_len)
- if (strptr[best_len] != matchptr[best_len])
- break;
-
- /* Record the match. */
- *lz_matchptr++ = (struct lz_match) {
- .len = best_len,
- .offset = strptr - matchptr,
- };
-
- /* Terminate the search if 'nice_len' was reached. */
- if (best_len >= nice_len)
- break;
- #endif /* !HAVE_FAST_LZ_EXTEND */
-
next_match:
/* Continue to next match in the chain. */
;
# include "config.h"
#endif
+#include "wimlib/bitops.h"
#include "wimlib/endianness.h"
#include "wimlib/lzms.h"
#include "wimlib/unaligned.h"
LZMS_ASSERT(slot == expected_num_slots);
slot_bases[slot] = final;
- extra_bits[slot - 1] = bsr32(slot_bases[slot] - slot_bases[slot - 1]);
+ extra_bits[slot - 1] = fls32(slot_bases[slot] - slot_bases[slot - 1]);
}
/* Initialize the global offset and length slot tables. */
LZMS_DEBUG("Undid x86 translation at position %d "
"(opcode 0x%02x)", i, data[i]);
void *p32 = &data[i + num_op_bytes];
- u32 n = le32_to_cpu(load_le32_unaligned(p32));
- store_le32_unaligned(cpu_to_le32(n - i), p32);
+ u32 n = get_unaligned_u32_le(p32);
+ put_unaligned_u32_le(n - i, p32);
}
- pos = i + le16_to_cpu(load_le16_unaligned(&data[i + num_op_bytes]));
+ pos = i + get_unaligned_u16_le(&data[i + num_op_bytes]);
} else {
- pos = i + le16_to_cpu(load_le16_unaligned(&data[i + num_op_bytes]));
+ pos = i + get_unaligned_u16_le(&data[i + num_op_bytes]);
if (i - *closest_target_usage_p <= max_trans_offset) {
LZMS_DEBUG("Did x86 translation at position %d "
"(opcode 0x%02x)", i, data[i]);
void *p32 = &data[i + num_op_bytes];
- u32 n = le32_to_cpu(load_le32_unaligned(p32));
- store_le32_unaligned(cpu_to_le32(n + i), p32);
+ u32 n = get_unaligned_u32_le(p32);
+ put_unaligned_u32_le(n + i, p32);
}
}
#include "wimlib/lz_mf.h"
#include "wimlib/lz_repsearch.h"
#include "wimlib/lzms.h"
+#include "wimlib/unaligned.h"
#include "wimlib/util.h"
#include <string.h>
/* Write a coding unit, unless it would underflow the buffer. */
if (os->next != os->begin)
- *--os->next = cpu_to_le16(os->bitbuf >> os->bitcount);
+ put_unaligned_u16_le(os->bitbuf >> os->bitcount, --os->next);
/* Optimization for call sites that never write more than 16
* bits at once. */
return false;
if (os->bitcount != 0)
- *--os->next = cpu_to_le16(os->bitbuf << (16 - os->bitcount));
+ put_unaligned_u16_le(os->bitbuf << (16 - os->bitcount), --os->next);
return true;
}
* ((rc->low >> 32) != 0, a.k.a. the carry bit is 1). */
do {
if (likely(rc->next >= rc->begin)) {
- if (rc->next != rc->end)
- *rc->next++ = cpu_to_le16(rc->cache +
- (u16)(rc->low >> 32));
+ if (rc->next != rc->end) {
+ put_unaligned_u16_le(rc->cache +
+ (u16)(rc->low >> 32),
+ rc->next++);
+ }
} else {
rc->next++;
}
#include "wimlib/decompress_common.h"
#include "wimlib/error.h"
#include "wimlib/lzms.h"
+#include "wimlib/unaligned.h"
#include "wimlib/util.h"
#include <limits.h>
if (unlikely(is->num_le16_remaining == 0))
return -1;
- next = le16_to_cpu(*--is->in);
+ next = get_unaligned_u16_le(--is->in);
is->num_le16_remaining--;
is->bitbuf |= next << (sizeof(is->bitbuf) * 8 - is->num_filled_bits - 16);
const le16 *in, size_t in_limit)
{
rd->range = 0xffffffff;
- rd->code = ((u32)le16_to_cpu(in[0]) << 16) |
- ((u32)le16_to_cpu(in[1]) << 0);
+ rd->code = ((u32)get_unaligned_u16_le(&in[0]) << 16) |
+ ((u32)get_unaligned_u16_le(&in[1]) << 0);
rd->in = in + 2;
rd->num_le16_remaining = in_limit - 2;
}
rd->range <<= 16;
if (unlikely(rd->num_le16_remaining == 0))
return -1;
- rd->code = (rd->code << 16) | le16_to_cpu(*rd->in++);
+ rd->code = (rd->code << 16) | get_unaligned_u16_le(rd->in++);
rd->num_le16_remaining--;
}
return 0;
out_next = ctx->out_next;
- lz_copy(out_next, length, offset, ctx->out_end);
+ lz_copy(out_next, length, offset, ctx->out_end, 1);
ctx->out_next = out_next + length;
return 0;
# include "config.h"
#endif
+#include "wimlib/bitops.h"
#include "wimlib/endianness.h"
#include "wimlib/lzx.h"
#include "wimlib/unaligned.h"
if (max_block_size == 0 || max_block_size > LZX_MAX_WINDOW_SIZE)
return 0;
- order = bsr32(max_block_size);
+ order = fls32(max_block_size);
if (((u32)1 << order) != max_block_size)
order++;
{
s32 abs_offset, rel_offset;
- rel_offset = le32_to_cpu(load_le32_unaligned(target));
+ rel_offset = get_unaligned_u32_le(target);
if (rel_offset >= -input_pos && rel_offset < LZX_WIM_MAGIC_FILESIZE) {
if (rel_offset < LZX_WIM_MAGIC_FILESIZE - input_pos) {
/* "good translation" */
/* "compensating translation" */
abs_offset = rel_offset - LZX_WIM_MAGIC_FILESIZE;
}
- store_le32_unaligned(cpu_to_le32(abs_offset), target);
+ put_unaligned_u32_le(abs_offset, target);
}
}
{
s32 abs_offset, rel_offset;
- abs_offset = le32_to_cpu(load_le32_unaligned(target));
+ abs_offset = get_unaligned_u32_le(target);
if (abs_offset >= 0) {
if (abs_offset < LZX_WIM_MAGIC_FILESIZE) {
/* "good translation" */
rel_offset = abs_offset - input_pos;
-
- store_le32_unaligned(cpu_to_le32(rel_offset), target);
+ put_unaligned_u32_le(rel_offset, target);
}
} else {
if (abs_offset >= -input_pos) {
/* "compensating translation" */
rel_offset = abs_offset + LZX_WIM_MAGIC_FILESIZE;
-
- store_le32_unaligned(cpu_to_le32(rel_offset), target);
+ put_unaligned_u32_le(rel_offset, target);
}
}
}
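A worked pass through the forward translation above (positions and displacements are illustrative; LZX_WIM_MAGIC_FILESIZE is the fixed pseudo file size the WIM flavor of LZX uses, 12000000 in wimlib):

/*
 * An 0xe8 opcode sits at chunk position input_pos = 0x100 and its
 * 32-bit displacement is rel_offset = 0x50:
 *
 *     rel_offset >= -input_pos                    (0x50 >= -0x100)
 *     rel_offset <  LZX_WIM_MAGIC_FILESIZE - input_pos
 *
 * so the "good translation" stores abs_offset = 0x50 + 0x100 = 0x150.
 * On decompression, 0 <= 0x150 < LZX_WIM_MAGIC_FILESIZE selects the
 * same case and restores rel_offset = 0x150 - 0x100 = 0x50.
 *
 * The payoff: calls from different positions to the same absolute
 * target now encode identical displacement bytes, which the match
 * finder can exploit.
 */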
void
lzx_e8_filter(u8 *data, u32 size, void (*process_target)(void *, s32))
{
+ u8 *p = data;
#ifdef __SSE2__
/* SSE2 vectorized implementation for x86_64. This speeds up LZX
* decompression by about 5-8% overall. (Usually --- the performance
* consists entirely of 0xe8 bytes. Also, this optimization affects
* compression as well, but the percentage improvement is less because
* LZX compression is much slower than LZX decompression. ) */
- __m128i *p128 = (__m128i *)data;
- u32 valid_mask = 0xFFFFFFFF;
+ if (size >= 32 && (uintptr_t)p % 16 == 0) {
+
+ u32 valid_mask = 0xFFFFFFFF;
- if (size >= 32 && (uintptr_t)data % 16 == 0) {
- __m128i * const end128 = p128 + size / 16 - 1;
+ u8 * const vec_end = p + (size & ~15) - 16;
/* Create a vector of all 0xe8 bytes */
const __m128i e8_bytes = _mm_set1_epi8(0xe8);
/* Compare the current 16-byte vector with the vector of
* all 0xe8 bytes. This produces 0xff where the byte is
* 0xe8 and 0x00 where it is not. */
- __m128i cmpresult = _mm_cmpeq_epi8(*p128, e8_bytes);
+ __m128i cmpresult = _mm_cmpeq_epi8(*(const __m128i *)p,
+ e8_bytes);
/* Map the comparison results into a single 16-bit
* number. It will contain a 1 bit when the
* index of the byte, within the 16, at
* which the next e8 translation should
* be done. */
- u32 bit = __builtin_ctz(e8_mask);
+ int bit = ffs32(e8_mask);
/* Do (or undo) the e8 translation. */
- u8 *p8 = (u8 *)p128 + bit;
- (*process_target)(p8 + 1,
- p8 - data);
+ (*process_target)(p + bit + 1,
+ p + bit - data);
/* Don't start an e8 translation in the
* next 4 bytes. */
valid_mask >>= 16;
valid_mask |= 0xFFFF0000;
}
- } while (++p128 < end128);
- }
+ } while ((p += 16) < vec_end);
- u8 *p8 = (u8 *)p128;
- while (!(valid_mask & 1)) {
- p8++;
- valid_mask >>= 1;
+ while (!(valid_mask & 1)) {
+ p++;
+ valid_mask >>= 1;
+ }
}
-#else /* __SSE2__ */
- u8 *p8 = data;
#endif /* __SSE2__ */
if (size > 10) {
/* Finish any bytes that weren't processed by the vectorized
* implementation. */
- u8 *p8_end = data + size - 10;
+ u8 *end = data + size - 10;
do {
- if (*p8 == 0xe8) {
- (*process_target)(p8 + 1, p8 - data);
- p8 += 5;
+ if (*p == 0xe8) {
+ (*process_target)(p + 1, p - data);
+ p += 5;
} else {
- p8++;
+ p++;
}
- } while (p8 < p8_end);
+ } while (p < end);
}
}
/* Write a coding unit, unless it would overflow the buffer. */
if (os->next != os->end)
- *os->next++ = cpu_to_le16(os->bitbuf >> os->bitcount);
+ put_unaligned_u16_le(os->bitbuf >> os->bitcount, os->next++);
/* If writing 17 bits, a second coding unit might need to be
* written. But because 'max_num_bits' is a compile-time
* call sites. */
if (max_num_bits == 17 && os->bitcount == 16) {
if (os->next != os->end)
- *os->next++ = cpu_to_le16(os->bitbuf);
+ put_unaligned_u16_le(os->bitbuf, os->next++);
os->bitcount = 0;
}
}
return 0;
if (os->bitcount != 0)
- *os->next++ = cpu_to_le16(os->bitbuf << (16 - os->bitcount));
+ put_unaligned_u16_le(os->bitbuf << (16 - os->bitcount), os->next++);
return (const u8 *)os->next - (const u8 *)os->start;
}
if (unlikely(match_offset > window_ptr - window))
return -1;
- lz_copy(window_ptr, match_len, match_offset, window_end);
+ lz_copy(window_ptr, match_len, match_offset, window_end,
+ LZX_MIN_MATCH_LEN);
window_ptr += match_len;
}
#endif
#include "wimlib/assert.h"
+#include "wimlib/bitops.h"
#include "wimlib/endianness.h"
#include "wimlib/error.h"
#include "wimlib/file_io.h"
}
}
- const u32 chunk_order = bsr32(chunk_size);
+ const u32 chunk_order = fls32(chunk_size);
/* Calculate the total number of chunks the resource is divided into. */
const u64 num_chunks = (rspec->uncompressed_size + chunk_size - 1) >> chunk_order;
/* Now fill in chunk_offsets from the entries we have read in
* chunk_tab_data. We break aliasing rules here to avoid having
* to allocate yet another array. */
- typedef le64 __attribute__((may_alias)) aliased_le64_t;
- typedef le32 __attribute__((may_alias)) aliased_le32_t;
+ typedef le64 _may_alias_attribute aliased_le64_t;
+ typedef le32 _may_alias_attribute aliased_le32_t;
u64 * chunk_offsets_p = chunk_offsets;
if (alt_chunk_table) {
# include "config.h"
#endif
+/* Make sure the POSIX-compatible strerror_r() is declared, rather than the GNU
+ * version, which has a different return type. */
#ifdef _GNU_SOURCE
# define _GNU_SOURCE_DEFINED 1
# undef _GNU_SOURCE
+# ifndef _POSIX_C_SOURCE
+# define _POSIX_C_SOURCE 200112L
+# endif
#endif
-/* Make sure the POSIX-compatible strerror_r() is declared, rather than the GNU
- * version, which has a different return type. */
#include <string.h>
#ifdef _GNU_SOURCE_DEFINED
# define _GNU_SOURCE
#endif
#include "wimlib.h"
+#include "wimlib/bitops.h"
#include "wimlib/dentry.h"
#include "wimlib/encoding.h"
#include "wimlib/file_io.h"
/* Chunk size must be power of 2. */
if (chunk_size == 0)
return false;
- order = bsr32(chunk_size);
+ order = fls32(chunk_size);
if (chunk_size != 1U << order)
return false;
0 != (ctx->write_resource_flags &
WRITE_RESOURCE_FLAG_PACK_STREAMS));
- typedef le64 __attribute__((may_alias)) aliased_le64_t;
- typedef le32 __attribute__((may_alias)) aliased_le32_t;
+ typedef le64 _may_alias_attribute aliased_le64_t;
+ typedef le32 _may_alias_attribute aliased_le32_t;
if (chunk_entry_size == 4) {
aliased_le32_t *entries = (aliased_le32_t*)ctx->chunk_csizes;
# include "config.h"
#endif
+#include "wimlib/bitops.h"
#include "wimlib/compress_common.h"
#include "wimlib/compressor_ops.h"
#include "wimlib/endianness.h"
#include "wimlib/error.h"
#include "wimlib/lz_mf.h"
+#include "wimlib/unaligned.h"
#include "wimlib/util.h"
#include "wimlib/xpress.h"
if (os->bitcount > 16) {
os->bitcount -= 16;
if (os->end - os->next_byte >= 2) {
- *(le16 *)os->next_bits = cpu_to_le16(os->bitbuf >> os->bitcount);
+ put_unaligned_u16_le(os->bitbuf >> os->bitcount, os->next_bits);
os->next_bits = os->next_bits2;
os->next_bits2 = os->next_byte;
os->next_byte += 2;
if (unlikely(os->end - os->next_byte < 2))
return 0;
- *(le16 *)os->next_bits = cpu_to_le16(os->bitbuf << (16 - os->bitcount));
- *(le16 *)os->next_bits2 = cpu_to_le16(0);
+ put_unaligned_u16_le(os->bitbuf << (16 - os->bitcount), os->next_bits);
+ put_unaligned_u16_le(0, os->next_bits2);
return os->next_byte - os->start;
}
{
unsigned adjusted_len = len - XPRESS_MIN_MATCH_LEN;
unsigned len_hdr = min(adjusted_len, 0xf);
- unsigned offset_bsr = bsr32(offset);
- unsigned sym = XPRESS_NUM_CHARS + ((offset_bsr << 4) | len_hdr);
+ unsigned offset_high_bit = fls32(offset);
+ unsigned sym = XPRESS_NUM_CHARS + ((offset_high_bit << 4) | len_hdr);
c->freqs[sym]++;
*(*next_chosen_item)++ = (struct xpress_item) {
.data = (u64)sym |
((u64)adjusted_len << 9) |
- ((u64)offset_bsr << 25) |
- ((u64)(offset ^ (1U << offset_bsr)) << 29),
+ ((u64)offset_high_bit << 25) |
+ ((u64)(offset ^ (1U << offset_high_bit)) << 29),
};
}
}
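A worked example of the symbol packing above (numbers are illustrative; XPRESS_MIN_MATCH_LEN is 3 and XPRESS_NUM_CHARS is 256):

/*
 * Match of length 10 at offset 300:
 *
 *     adjusted_len    = 10 - 3 = 7
 *     len_hdr         = min(7, 0xf) = 7
 *     offset_high_bit = fls32(300) = 8          (256 <= 300 < 512)
 *     sym             = 256 + ((8 << 4) | 7) = 391
 *     extra bits      = offset ^ (1 << 8) = 300 - 256 = 44, in 8 bits
 *
 * The decompressor unpacks sym: len_hdr = 391 & 0xf = 7, high bit =
 * (391 >> 4) & 0xf = 8, then reads 8 extra offset bits and rebuilds
 * offset = (1 << 8) | 44 = 300 and length = 7 + 3 = 10.
 */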
u32 cost;
u32 position_cost;
unsigned offset;
- unsigned offset_bsr;
+ unsigned offset_high_bit;
unsigned adjusted_len;
unsigned len_hdr;
unsigned sym;
/* All lengths are small. Optimize accordingly. */
do {
offset = matches[i].offset;
- offset_bsr = bsr32(offset);
+ offset_high_bit = fls32(offset);
len_hdr = len - XPRESS_MIN_MATCH_LEN;
- sym = XPRESS_NUM_CHARS + ((offset_bsr << 4) | len_hdr);
+ sym = XPRESS_NUM_CHARS + ((offset_high_bit << 4) | len_hdr);
- position_cost = cur_optimum_ptr->cost + offset_bsr;
+ position_cost = cur_optimum_ptr->cost + offset_high_bit;
do {
cost = position_cost + c->costs[sym];
if (cost < (cur_optimum_ptr + len)->cost) {
/* Some lengths are big. */
do {
offset = matches[i].offset;
- offset_bsr = bsr32(offset);
- position_cost = cur_optimum_ptr->cost + offset_bsr;
+ offset_high_bit = fls32(offset);
+ position_cost = cur_optimum_ptr->cost + offset_high_bit;
do {
adjusted_len = len - XPRESS_MIN_MATCH_LEN;
len_hdr = min(adjusted_len, 0xf);
- sym = XPRESS_NUM_CHARS + ((offset_bsr << 4) | len_hdr);
+ sym = XPRESS_NUM_CHARS + ((offset_high_bit << 4) | len_hdr);
cost = position_cost + c->costs[sym];
if (adjusted_len >= 0xf) {
u8 *window_end = &window[window_size];
unsigned sym;
unsigned match_len;
- unsigned offset_bsr;
+ unsigned offset_high_bit;
unsigned match_offset;
while (window_ptr != window_end) {
/* Match */
match_len = sym & 0xf;
- offset_bsr = (sym >> 4) & 0xf;
+ offset_high_bit = (sym >> 4) & 0xf;
bitstream_ensure_bits(istream, 16);
- match_offset = (1 << offset_bsr) |
- bitstream_pop_bits(istream, offset_bsr);
+ match_offset = (1 << offset_high_bit) |
+ bitstream_pop_bits(istream, offset_high_bit);
if (match_len == 0xf) {
match_len += bitstream_read_byte(istream);
if (unlikely(match_len > window_end - window_ptr))
return -1;
- lz_copy(window_ptr, match_len, match_offset, window_end);
+ lz_copy(window_ptr, match_len, match_offset, window_end,
+ XPRESS_MIN_MATCH_LEN);
window_ptr += match_len;
}