From 0ecb0529b5fcacc1abafa1f3f02a40c44783ada8 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 9 Dec 2014 19:40:17 -0600 Subject: [PATCH] portability and compression cleanups - Split compiler.h into compiler.h and compiler-gcc.h - Compile with -std=c99. Some GCC extensions are still used but they are explicitly defined in compiler-gcc.h. - Add get_unaligned_uXX_le() and put_unaligned_uXX_le() helpers - Faster lz_copy() in some cases - Faster lz_extend() in some cases - New bitops.h header --- Makefile.am | 4 +- include/wimlib/bitops.h | 92 ++++++++++++++++++++++++ include/wimlib/compiler-gcc.h | 69 ++++++++++++++++++ include/wimlib/compiler.h | 106 ++++++++++++++++++---------- include/wimlib/decompress_common.h | 108 +++++++++++++++++++---------- include/wimlib/endianness.h | 108 ++++++++++++++++------------- include/wimlib/lz_extend.h | 75 +++++++++----------- include/wimlib/lz_hash3.h | 47 ++++++++----- include/wimlib/lz_repsearch.h | 9 +-- include/wimlib/lzx.h | 3 +- include/wimlib/types.h | 11 ++- include/wimlib/unaligned.h | 95 ++++++++++++++++++++++++- include/wimlib/util.h | 34 +-------- src/decompress_common.c | 57 ++++++++------- src/lz_hash_chains.c | 88 ++++++++++++----------- src/lzms-common.c | 15 ++-- src/lzms-compress.c | 13 ++-- src/lzms-decompress.c | 11 +-- src/lzx-common.c | 59 ++++++++-------- src/lzx-compress.c | 6 +- src/lzx-decompress.c | 3 +- src/resource.c | 7 +- src/util.c | 7 +- src/wim.c | 3 +- src/write.c | 4 +- src/xpress-compress.c | 30 ++++---- src/xpress-decompress.c | 11 +-- 27 files changed, 703 insertions(+), 372 deletions(-) create mode 100644 include/wimlib/bitops.h create mode 100644 include/wimlib/compiler-gcc.h diff --git a/Makefile.am b/Makefile.am index e49c5be6..a9846de6 100644 --- a/Makefile.am +++ b/Makefile.am @@ -3,7 +3,7 @@ ACLOCAL_AMFLAGS = -I m4 AM_CPPFLAGS = -I$(top_srcdir)/include $(WINDOWS_CPPFLAGS) \ -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -AM_CFLAGS = -std=gnu99 -Wmissing-prototypes -Wstrict-prototypes \ +AM_CFLAGS = -std=c99 -Wmissing-prototypes -Wstrict-prototypes \ -Werror-implicit-function-declaration \ -fno-common -Wundef -Wno-pointer-sign @@ -82,10 +82,12 @@ libwim_la_SOURCES = \ include/wimlib/apply.h \ include/wimlib/assert.h \ include/wimlib/avl_tree.h \ + include/wimlib/bitops.h \ include/wimlib/callback.h \ include/wimlib/capture.h \ include/wimlib/case.h \ include/wimlib/compiler.h \ + include/wimlib/compiler-gcc.h \ include/wimlib/compressor_ops.h \ include/wimlib/compress_common.h \ include/wimlib/chunk_compressor.h \ diff --git a/include/wimlib/bitops.h b/include/wimlib/bitops.h new file mode 100644 index 00000000..2e7a948f --- /dev/null +++ b/include/wimlib/bitops.h @@ -0,0 +1,92 @@ +/* + * bitops.h + * + * Inline functions for bit manipulation. + * + * The author dedicates this file to the public domain. + * You can do whatever you want with this file. 
+ */ + +#ifndef _WIMLIB_BITOPS_H +#define _WIMLIB_BITOPS_H + +#include "wimlib/compiler.h" +#include "wimlib/types.h" + +/* Find Last Set bit */ + +static inline unsigned +fls32(u32 v) +{ +#ifdef compiler_fls32 + return compiler_fls32(v); +#else + unsigned bit = 0; + while ((v >>= 1) != 0) + bit++; + return bit; +#endif +} + +static inline unsigned +fls64(u64 v) +{ +#ifdef compiler_fls64 + return compiler_fls64(v); +#else + unsigned bit = 0; + while ((v >>= 1) != 0) + bit++; + return bit; +#endif +} + +static inline unsigned +flsw(machine_word_t v) +{ + BUILD_BUG_ON(WORDSIZE != 4 && WORDSIZE != 8); + if (WORDSIZE == 4) + return fls32(v); + else + return fls64(v); +} + +/* Find First Set bit */ + +static inline unsigned +ffs32(u32 v) +{ +#ifdef compiler_ffs32 + return compiler_ffs32(v); +#else + unsigned bit; + for (bit = 0; !(v & 1); bit++, v >>= 1) + ; + return bit; +#endif +} + +static inline unsigned +ffs64(u64 v) +{ +#ifdef compiler_ffs64 + return compiler_ffs64(v); +#else + unsigned bit; + for (bit = 0; !(v & 1); bit++, v >>= 1) + ; + return bit; +#endif +} + +static inline unsigned +ffsw(machine_word_t v) +{ + BUILD_BUG_ON(WORDSIZE != 4 && WORDSIZE != 8); + if (WORDSIZE == 4) + return ffs32(v); + else + return ffs64(v); +} + +#endif /* _WIMLIB_BITOPS_H */ diff --git a/include/wimlib/compiler-gcc.h b/include/wimlib/compiler-gcc.h new file mode 100644 index 00000000..db6e3249 --- /dev/null +++ b/include/wimlib/compiler-gcc.h @@ -0,0 +1,69 @@ +/* + * compiler-gcc.h + * + * The author dedicates this file to the public domain. + * You can do whatever you want with this file. + */ + +#ifndef _WIMLIB_COMPILER_GCC_H +#define _WIMLIB_COMPILER_GCC_H + +#ifdef __WIN32__ +# define WIMLIBAPI __declspec(dllexport) +#else +# define WIMLIBAPI __attribute__((visibility("default"))) +#endif + +#define _packed_attribute __attribute__((packed)) +#define _aligned_attribute(n) __attribute__((aligned(n))) +#define _may_alias_attribute __attribute__((may_alias)) +#define likely(expr) __builtin_expect(!!(expr), 1) +#define unlikely(expr) __builtin_expect(!!(expr), 0) +#define prefetch(addr) __builtin_prefetch(addr) +#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4) +# define _cold_attribute __attribute__((cold)) +#endif +#define _malloc_attribute __attribute__((malloc)) +#define inline inline __attribute__((always_inline)) + +#define CPU_IS_BIG_ENDIAN (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + +#if defined(__x86_64__) || defined(__i386__) +# define UNALIGNED_ACCESS_SPEED 3 +#elif defined(__ARM_FEATURE_UNALIGNED) && (__ARM_FEATURE_UNALIGNED == 1) +# define UNALIGNED_ACCESS_SPEED 2 +#else +# define UNALIGNED_ACCESS_SPEED 0 +#endif + +#define typeof __typeof__ + +#ifndef min +# define min(a, b) ({ typeof(a) _a = (a); typeof(b) _b = (b); \ + (_a < _b) ? _a : _b; }) +#endif + +#ifndef max +# define max(a, b) ({ typeof(a) _a = (a); typeof(b) _b = (b); \ + (_a > _b) ? 
_a : _b; }) +#endif + +#ifndef swap +# define swap(a, b) ({ typeof(a) _a = (a); (a) = (b); (b) = _a; }) +#endif + +#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) +# define compiler_bswap32 __builtin_bswap32 +# define compiler_bswap64 __builtin_bswap64 +#endif + +#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) +# define compiler_bswap16 __builtin_bswap16 +#endif + +#define compiler_fls32(n) (31 - __builtin_clz(n)) +#define compiler_fls64(n) (63 - __builtin_clzll(n)) +#define compiler_ffs32(n) __builtin_ctz(n) +#define compiler_ffs64(n) __builtin_ctzll(n) + +#endif /* _WIMLIB_COMPILER_GCC_H */ diff --git a/include/wimlib/compiler.h b/include/wimlib/compiler.h index 221df688..a1f513f5 100644 --- a/include/wimlib/compiler.h +++ b/include/wimlib/compiler.h @@ -9,44 +9,72 @@ #define _WIMLIB_COMPILER_H #ifdef __GNUC__ -# if defined(__CYGWIN__) || defined(__WIN32__) -# define WIMLIBAPI __declspec(dllexport) -# else -# define WIMLIBAPI __attribute__((visibility("default"))) -# endif -# define _always_inline_attribute inline __attribute__((always_inline)) -# define _no_inline_attribute __attribute__((noinline)) -# define _packed_attribute __attribute__((packed)) -# define _format_attribute(type, format_str, args_start) \ - /*__attribute__((format(type, format_str, args_start))) */ -# if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4) -# define _cold_attribute __attribute__((cold)) -# else -# define _cold_attribute -# endif -# define _malloc_attribute __attribute__((malloc)) -# define _warn_unused_result_attribute __attribute__((warn_unused_result)) -# define _aligned_attribute(size) __attribute__((aligned(size))) -# define likely(x) __builtin_expect(!!(x), 1) -# define unlikely(x) __builtin_expect(!!(x), 0) -# define inline inline __attribute__((always_inline)) -# define prefetch(x) __builtin_prefetch(x) -# define is_constant(x) __builtin_constant_p(x) +# include "wimlib/compiler-gcc.h" #else -# define WIMLIBAPI -# define _always_inline_attribute inline -# define _no_inline_attribute -# define _format_attribute(type, format_str, args_start) -# define _cold_attribute -# define _packed_attribute -# define _malloc_attribute -# define _warn_unused_result_attribute -# define _aligned_attribute(size) -# define likely(x) (x) -# define unlikely(x) (x) -# define prefetch(x) -# define is_constant(x) (0) -#endif /* __GNUC__ */ +# error "Unrecognized compiler. Please add a header file for your compiler." 
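+/* Such a header (for example, a hypothetical compiler-msvc.h) would need to
+ * define at least _packed_attribute, _aligned_attribute, _may_alias_attribute,
+ * CPU_IS_BIG_ENDIAN, typeof, and min()/max()/swap(); everything else checked
+ * for below has a fallback definition. */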
+#endif + +#ifndef WIMLIBAPI +# define WIMLIBAPI +#endif + +#ifndef _packed_attribute +# error "missing required definition of _packed_attribute" +#endif + +#ifndef _aligned_attribute +# error "missing required definition of _aligned_attribute" +#endif + +#ifndef _may_alias_attribute +# error "missing required definition of _may_alias_attribute" +#endif + +#ifndef likely +# define likely(expr) (expr) +#endif + +#ifndef unlikely +# define unlikely(expr) (expr) +#endif + +#ifndef prefetch +# define prefetch(addr) +#endif + +#ifndef _cold_attribute +# define _cold_attribute +#endif + +#ifndef _malloc_attribute +# define _malloc_attribute +#endif + +#ifndef _format_attribute +# define _format_attribute(type, format_str, format_start) +#endif + +#ifndef CPU_IS_BIG_ENDIAN +# error "missing required definition of CPU_IS_BIG_ENDIAN" +#endif + +#define CPU_IS_LITTLE_ENDIAN (!CPU_IS_BIG_ENDIAN) + +#ifndef UNALIGNED_ACCESS_SPEED +# define UNALIGNED_ACCESS_SPEED 0 +#endif + +#define UNALIGNED_ACCESS_IS_ALLOWED (UNALIGNED_ACCESS_SPEED >= 1) +#define UNALIGNED_ACCESS_IS_FAST (UNALIGNED_ACCESS_SPEED >= 2) +#define UNALIGNED_ACCESS_IS_VERY_FAST (UNALIGNED_ACCESS_SPEED >= 3) + +#ifndef typeof +# error "missing required definition of typeof" +#endif + +#if !defined(min) || !defined(max) || !defined(swap) +# error "missing required definitions of min(), max(), and swap() macros" +#endif #ifdef __CHECKER__ # define _bitwise_attr __attribute__((bitwise)) @@ -56,4 +84,8 @@ # define _force_attr #endif +#ifndef BUILD_BUG_ON +# define BUILD_BUG_ON(expr) ((void)sizeof(char[1 - 2*!!(expr)])) +#endif + #endif /* _WIMLIB_COMPILER_H */ diff --git a/include/wimlib/decompress_common.h b/include/wimlib/decompress_common.h index 867e84a0..6762043c 100644 --- a/include/wimlib/decompress_common.h +++ b/include/wimlib/decompress_common.h @@ -14,6 +14,7 @@ #include "wimlib/compiler.h" #include "wimlib/endianness.h" #include "wimlib/types.h" +#include "wimlib/unaligned.h" /* Structure that encapsulates a block of in-memory data being interpreted as a * stream of bits, optionally with interwoven literal bytes. Bits are assumed @@ -67,8 +68,7 @@ bitstream_ensure_bits(struct input_bitstream *is, const unsigned num_bits) if (unlikely(is->end - is->next < 2)) goto overflow; - is->bitbuf |= (u32)le16_to_cpu(*(const le16 *)is->next) - << (16 - is->bitsleft); + is->bitbuf |= (u32)get_unaligned_u16_le(is->next) << (16 - is->bitsleft); is->next += 2; is->bitsleft += 16; @@ -76,7 +76,7 @@ bitstream_ensure_bits(struct input_bitstream *is, const unsigned num_bits) if (unlikely(is->end - is->next < 2)) goto overflow; - is->bitbuf |= (u32)le16_to_cpu(*(const le16 *)is->next); + is->bitbuf |= (u32)get_unaligned_u16_le(is->next); is->next += 2; is->bitsleft = 32; } @@ -144,7 +144,7 @@ bitstream_read_u16(struct input_bitstream *is) if (unlikely(is->end - is->next < 2)) return 0; - v = le16_to_cpu(*(const le16 *)is->next); + v = get_unaligned_u16_le(is->next); is->next += 2; return v; } @@ -157,7 +157,7 @@ bitstream_read_u32(struct input_bitstream *is) if (unlikely(is->end - is->next < 4)) return 0; - v = le32_to_cpu(*(const le32 *)is->next); + v = get_unaligned_u32_le(is->next); is->next += 4; return v; } @@ -256,7 +256,7 @@ make_huffman_decode_table(u16 decode_table[], unsigned num_syms, /* - * Copy a LZ77 match at (dst - offset) to dst. + * Copy an LZ77 match at (dst - offset) to dst. 
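+ *
+ * (@min_length is the minimum possible value of 'length'.  At the call sites
+ * in this patch it is a compile-time constant, which lets the bytewise
+ * fallback at the end of the function copy the first few bytes
+ * unconditionally.)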
 *
 * The length and offset must be already validated --- that is, (dst - offset)
 * can't underrun the output buffer, and (dst + length) can't overrun the output
@@ -266,50 +266,88 @@ make_huffman_decode_table(u16 decode_table[], unsigned num_syms,
 *
 * This function won't write any data beyond this position.
 */
 static inline void
-lz_copy(u8 *dst, u32 length, u32 offset, const u8 *winend)
+lz_copy(u8 *dst, u32 length, u32 offset, const u8 *winend, u32 min_length)
 {
        const u8 *src = dst - offset;
-#if defined(__x86_64__) || defined(__i386__)
-       /* Copy one 'unsigned long' at a time.  On i386 and x86_64 this is
+       const u8 * const end = dst + length;
+
+       /*
+        * Try to copy one machine word at a time.  On i386 and x86_64 this is
        * faster than copying one byte at a time, unless the data is
        * near-random and all the matches have very short lengths.  Note that
        * since this requires unaligned memory accesses, it won't necessarily
        * be faster on every architecture.
        *
        * Also note that we might copy more than the length of the match.  For
-        * example, if an 'unsigned long' is 8 bytes and the match is of length
-        * 5, then we'll simply copy 8 bytes.  This is okay as long as we don't
-        * write beyond the end of the output buffer, hence the check for
-        * (winend - (dst + length) >= sizeof(unsigned long) - 1). */
-       if (offset >= sizeof(unsigned long) &&
-           winend - (dst + length) >= sizeof(unsigned long) - 1)
+        * example, if a word is 8 bytes and the match is of length 5, then
+        * we'll simply copy 8 bytes.  This is okay as long as we don't write
+        * beyond the end of the output buffer, hence the check for (winend -
+        * end >= WORDSIZE - 1).
+        */
+       if (UNALIGNED_ACCESS_IS_VERY_FAST &&
+           likely(winend - end >= WORDSIZE - 1))
        {
-               /* Access memory through a packed struct.  This tricks the
-                * compiler into allowing unaligned memory accesses. */
-               struct ulong_wrapper {
-                       unsigned long v;
-               } _packed_attribute;
-
-               const u8 * const end = dst + length;
-               unsigned long v;
-
-               v = ((struct ulong_wrapper *)src)->v;
-               ((struct ulong_wrapper *)dst)->v = v;
-               dst += sizeof(unsigned long);
-               src += sizeof(unsigned long);
-               if (dst < end) {
+               if (offset >= WORDSIZE) {
+                       /* The source and destination words don't overlap. */
+
+                       /* To improve branch prediction, one iteration of this
+                        * loop is unrolled.  Most matches are short and will
+                        * fail the first check.  But if that check passes, then
+                        * it becomes increasingly likely that the match is long
+                        * and we'll need to continue copying. */
+
+                       copy_word_unaligned(src, dst);
+                       src += WORDSIZE;
+                       dst += WORDSIZE;
+
+                       if (dst < end) {
+                               do {
+                                       copy_word_unaligned(src, dst);
+                                       src += WORDSIZE;
+                                       dst += WORDSIZE;
+                               } while (dst < end);
+                       }
+                       return;
+               } else if (offset == 1) {
+
+                       /* Offset 1 matches are equivalent to run-length
+                        * encoding of the previous byte.  This case is common
+                        * if the data contains many repeated bytes. */
+
+                       machine_word_t v = repeat_byte(*(dst - 1));
                        do {
-                               v = ((struct ulong_wrapper *)src)->v;
-                               ((struct ulong_wrapper *)dst)->v = v;
-                               dst += sizeof(unsigned long);
-                               src += sizeof(unsigned long);
+                               store_word_unaligned(v, dst);
+                               src += WORDSIZE;
+                               dst += WORDSIZE;
                        } while (dst < end);
+                       return;
                }
+               /*
+                * We don't bother with special cases for other 'offset <
+                * WORDSIZE', which are usually rarer than 'offset == 1'.  Extra
+                * checks will just slow things down.
+                * Actually, it's possible to handle all the 'offset <
+                * WORDSIZE' cases using the same code, but it becomes more
+                * complicated and doesn't seem any faster overall; it
+                * definitely slows down the more common 'offset == 1' case.
+                */
+       }
 
-       return;
+       /* Fall back to a bytewise copy. */
+
+       if (min_length >= 2) {
+               *dst++ = *src++;
+               length--;
+       }
+       if (min_length >= 3) {
+               *dst++ = *src++;
+               length--;
+       }
+       if (min_length >= 4) {
+               *dst++ = *src++;
+               length--;
+       }
-#endif
        do {
                *dst++ = *src++;
        } while (--length);
diff --git a/include/wimlib/endianness.h b/include/wimlib/endianness.h
index 4c1b239f..d4de3350 100644
--- a/include/wimlib/endianness.h
+++ b/include/wimlib/endianness.h
@@ -1,71 +1,85 @@
+/*
+ * endianness.h
+ *
+ * Macros and inline functions for endianness conversion.
+ *
+ * The author dedicates this file to the public domain.
+ * You can do whatever you want with this file.
+ */
+
 #ifndef _WIMLIB_ENDIANNESS_H
 #define _WIMLIB_ENDIANNESS_H
 
+#include "wimlib/compiler.h"
 #include "wimlib/types.h"
 
-/* Watch out for conflict with ntfs-3g/endian.h ... */
-
+/* Watch out for conflict with ntfs-3g/endians.h ... */
 #ifndef _NTFS_ENDIANS_H
 
-static inline u16
-bswap16(u16 n)
+static inline u16 bswap16(u16 n)
 {
+#ifdef compiler_bswap16
+       return compiler_bswap16(n);
+#else
        return (n << 8) | (n >> 8);
+#endif
 }
 
-static inline u32
-bswap32(u32 n)
+static inline u32 bswap32(u32 n)
 {
-#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)
-       return __builtin_bswap32(n);
+#ifdef compiler_bswap32
+       return compiler_bswap32(n);
 #else
-       return (n << 24) | ((n & 0xff00) << 8) | ((n & 0xff0000) >> 8) |
-              (n >> 24);
+       return (n << 24) |
+              ((n & 0xFF00) << 8) |
+              ((n & 0xFF0000) >> 8) |
+              (n >> 24);
 #endif
 }
 
-static inline u64
-bswap64(u64 n)
+static inline u64 bswap64(u64 n)
 {
-#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)
-       return __builtin_bswap64(n);
+#ifdef compiler_bswap64
+       return compiler_bswap64(n);
 #else
-       return (n << 56) | ((n & 0xff00) << 40) | ((n & 0xff0000) << 24) |
-              ((n & 0xff000000) << 8) | ((n & 0xff00000000) >> 8) |
-              ((n & 0xff0000000000) >> 24) |
-              ((n & 0xff000000000000) >> 40) | (n >> 56);
+       return (n << 56) |
+              ((n & 0xFF00) << 40) |
+              ((n & 0xFF0000) << 24) |
+              ((n & 0xFF000000) << 8) |
+              ((n & 0xFF00000000) >> 8) |
+              ((n & 0xFF0000000000) >> 24) |
+              ((n & 0xFF000000000000) >> 40) |
+              (n >> 56);
 #endif
 }
 
-# ifdef WORDS_BIGENDIAN
-# define cpu_to_le16(n) ((_force_attr le16)bswap16(n))
-# define cpu_to_le32(n) ((_force_attr le32)bswap32(n))
-# define cpu_to_le64(n) ((_force_attr le64)bswap64(n))
-# define le16_to_cpu(n) bswap16((_force_attr u16)(le16)(n))
-# define le32_to_cpu(n) bswap32((_force_attr u32)(le32)(n))
-# define le64_to_cpu(n) bswap64((_force_attr u64)(le64)(n))
-
-# define cpu_to_be16(n) ((_force_attr be16)(u16)(n))
-# define cpu_to_be32(n) ((_force_attr be32)(u32)(n))
-# define cpu_to_be64(n) ((_force_attr be64)(u64)(n))
-# define be16_to_cpu(n) ((_force_attr u16)(be16)(n))
-# define be32_to_cpu(n) ((_force_attr u32)(be32)(n))
-# define be64_to_cpu(n) ((_force_attr u64)(be64)(n))
-# else
-# define cpu_to_le16(n) ((_force_attr le16)(u16)(n))
-# define cpu_to_le32(n) ((_force_attr le32)(u32)(n))
-# define cpu_to_le64(n) ((_force_attr le64)(u64)(n))
-# define le16_to_cpu(n) ((_force_attr u16)(le16)(n))
-# define le32_to_cpu(n) ((_force_attr u32)(le32)(n))
-# define le64_to_cpu(n) ((_force_attr u64)(le64)(n))
+#if CPU_IS_BIG_ENDIAN
+# define cpu_to_le16(n) ((_force_attr le16)bswap16(n))
+# define cpu_to_le32(n) ((_force_attr le32)bswap32(n))
+# 
define cpu_to_le64(n) ((_force_attr le64)bswap64(n)) +# define le16_to_cpu(n) bswap16((_force_attr u16)(le16)(n)) +# define le32_to_cpu(n) bswap32((_force_attr u32)(le32)(n)) +# define le64_to_cpu(n) bswap64((_force_attr u64)(le64)(n)) +# define cpu_to_be16(n) ((_force_attr be16)(u16)(n)) +# define cpu_to_be32(n) ((_force_attr be32)(u32)(n)) +# define cpu_to_be64(n) ((_force_attr be64)(u64)(n)) +# define be16_to_cpu(n) ((_force_attr u16)(be16)(n)) +# define be32_to_cpu(n) ((_force_attr u32)(be32)(n)) +# define be64_to_cpu(n) ((_force_attr u64)(be64)(n)) +#else +# define cpu_to_le16(n) ((_force_attr le16)(u16)(n)) +# define cpu_to_le32(n) ((_force_attr le32)(u32)(n)) +# define cpu_to_le64(n) ((_force_attr le64)(u64)(n)) +# define le16_to_cpu(n) ((_force_attr u16)(le16)(n)) +# define le32_to_cpu(n) ((_force_attr u32)(le32)(n)) +# define le64_to_cpu(n) ((_force_attr u64)(le64)(n)) +# define cpu_to_be16(n) ((_force_attr be16)bswap16(n)) +# define cpu_to_be32(n) ((_force_attr be32)bswap32(n)) +# define cpu_to_be64(n) ((_force_attr be64)bswap64(n)) +# define be16_to_cpu(n) bswap16((_force_attr u16)(be16)(n)) +# define be32_to_cpu(n) bswap32((_force_attr u32)(be32)(n)) +# define be64_to_cpu(n) bswap64((_force_attr u64)(be64)(n)) +#endif -# define cpu_to_be16(n) ((_force_attr be16)bswap16(n)) -# define cpu_to_be32(n) ((_force_attr be32)bswap32(n)) -# define cpu_to_be64(n) ((_force_attr be64)bswap64(n)) -# define be16_to_cpu(n) bswap16((_force_attr u16)(be16)(n)) -# define be32_to_cpu(n) bswap32((_force_attr u32)(be32)(n)) -# define be64_to_cpu(n) bswap64((_force_attr u64)(be64)(n)) -# endif #endif /* _NTFS_ENDIANS_H */ - #endif /* _WIMLIB_ENDIANNESS_H */ diff --git a/include/wimlib/lz_extend.h b/include/wimlib/lz_extend.h index 7d7ed8b8..e9a56224 100644 --- a/include/wimlib/lz_extend.h +++ b/include/wimlib/lz_extend.h @@ -10,13 +10,8 @@ #ifndef _WIMLIB_LZ_EXTEND_H #define _WIMLIB_LZ_EXTEND_H -#include "wimlib/types.h" - -#if (defined(__x86_64__) || defined(__i386__)) && defined(__GNUC__) -# define HAVE_FAST_LZ_EXTEND 1 -#else -# define HAVE_FAST_LZ_EXTEND 0 -#endif +#include "wimlib/bitops.h" +#include "wimlib/unaligned.h" /* Return the number of bytes at @matchptr that match the bytes at @strptr, up * to a maximum of @max_len. Initially, @start_len bytes are matched. 
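 * For example, with strptr = "abcde...", matchptr = "abcxy...", start_len = 2
 * and a sufficiently large max_len, the return value would be 3: bytes 0 and 1
 * are assumed to match, byte 2 ('c') matches, and byte 3 ('d' vs. 'x') differs.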
*/ @@ -26,49 +21,45 @@ lz_extend(const u8 * const strptr, const u8 * const matchptr, { u32 len = start_len; -#if HAVE_FAST_LZ_EXTEND - - while (len + sizeof(unsigned long) <= max_len) { - unsigned long x; + if (UNALIGNED_ACCESS_IS_FAST) { - x = *(const unsigned long *)&matchptr[len] ^ - *(const unsigned long *)&strptr[len]; - if (x != 0) - return len + (__builtin_ctzl(x) >> 3); - len += sizeof(unsigned long); - } + machine_word_t v_word; - if (sizeof(unsigned int) < sizeof(unsigned long) && - len + sizeof(unsigned int) <= max_len) - { - unsigned int x; + if (likely(max_len - len >= 4 * WORDSIZE)) { - x = *(const unsigned int *)&matchptr[len] ^ - *(const unsigned int *)&strptr[len]; - if (x != 0) - return len + (__builtin_ctz(x) >> 3); - len += sizeof(unsigned int); - } + #define COMPARE_WORD_STEP \ + v_word = load_word_unaligned(&matchptr[len]) ^ \ + load_word_unaligned(&strptr[len]); \ + if (v_word != 0) \ + goto word_differs; \ + len += WORDSIZE; \ - if (sizeof(unsigned int) == 4) { - if (len < max_len && matchptr[len] == strptr[len]) { - len++; - if (len < max_len && matchptr[len] == strptr[len]) { - len++; - if (len < max_len && matchptr[len] == strptr[len]) { - len++; - } - } + COMPARE_WORD_STEP + COMPARE_WORD_STEP + COMPARE_WORD_STEP + COMPARE_WORD_STEP + #undef COMPARE_WORD_STEP } - return len; - } -#endif /* HAVE_FAST_LZ_EXTEND */ + while (len + WORDSIZE <= max_len) { + v_word = load_word_unaligned(&matchptr[len]) ^ + load_word_unaligned(&strptr[len]); + if (v_word != 0) + goto word_differs; + len += WORDSIZE; + } - while (len < max_len && matchptr[len] == strptr[len]) - len++; + while (len < max_len && matchptr[len] == strptr[len]) + len++; + return len; - return len; + word_differs: + return len + (ffsw(v_word) >> 3); + } else { + while (len < max_len && matchptr[len] == strptr[len]) + len++; + return len; + } } #endif /* _WIMLIB_LZ_EXTEND_H */ diff --git a/include/wimlib/lz_hash3.h b/include/wimlib/lz_hash3.h index feb2d9c3..b204984b 100644 --- a/include/wimlib/lz_hash3.h +++ b/include/wimlib/lz_hash3.h @@ -10,32 +10,45 @@ #ifndef _WIMLIB_LZ_HASH3_H #define _WIMLIB_LZ_HASH3_H -#include "wimlib/types.h" +#include "wimlib/unaligned.h" /* Constant for the multiplicative hash function. */ #define LZ_HASH_MULTIPLIER 0x1E35A7BD -/* Hash the next 3-byte sequence in the window, producing a hash of length - * 'num_bits' bits. 4 bytes must be available, since 32-bit unaligned load is - * faster on some architectures. */ static inline u32 -lz_hash(const u8 *p, unsigned int num_bits) +loaded_u32_to_u24(u32 v) { - u32 str; - u32 hash; + if (CPU_IS_LITTLE_ENDIAN) + return v & 0xFFFFFF; + else + return v >> 8; +} -#if defined(__i386__) || defined(__x86_64__) - /* Unaligned access allowed, and little endian CPU. - * Callers ensure that at least 4 (not 3) bytes are remaining. */ - str = *(const u32 *)p & 0xFFFFFF; -#else - str = ((u32)p[0] << 0) | ((u32)p[1] << 8) | ((u32)p[2] << 16); -#endif +static inline u32 +load_u24_unaligned(const u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) + return loaded_u32_to_u24(load_u32_unaligned(p)); + else + return ((u32)p[0] << 0) | ((u32)p[1] << 8) | ((u32)p[2] << 16); +} - hash = str * LZ_HASH_MULTIPLIER; +static inline u32 +lz_hash_u24(u32 str, unsigned num_bits) +{ + return (u32)(str * LZ_HASH_MULTIPLIER) >> (32 - num_bits); +} - /* High bits are more random than the low bits. */ - return hash >> (32 - num_bits); +/* + * Hash the next 3-byte sequence in the window, producing a hash of length + * 'num_bits' bits. 
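+ * The three bytes are read as a 24-bit little-endian integer and multiplied by
+ * LZ_HASH_MULTIPLIER; the top 'num_bits' bits of the 32-bit product (the most
+ * random ones) form the hash.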
At least LZ_HASH_REQUIRED_NBYTES must be available at 'p'; + * this might be 4 bytes rather than 3 because an unaligned load is faster on + * some architectures. + */ +static inline u32 +lz_hash(const u8 *p, unsigned num_bits) +{ + return lz_hash_u24(load_u24_unaligned(p), num_bits); } /* The number of bytes being hashed. */ diff --git a/include/wimlib/lz_repsearch.h b/include/wimlib/lz_repsearch.h index 6883bf62..0f031c12 100644 --- a/include/wimlib/lz_repsearch.h +++ b/include/wimlib/lz_repsearch.h @@ -11,6 +11,7 @@ #define _LZ_REPSEARCH_H #include "wimlib/lz_extend.h" +#include "wimlib/unaligned.h" extern u32 lz_extend_repmatch(const u8 *strptr, const u8 *matchptr, u32 max_len); @@ -31,18 +32,18 @@ lz_repsearch3(const u8 * const strptr, const u32 max_len, unsigned rep_max_idx; u32 rep_len; u32 rep_max_len; - const u16 str = *(const u16 *)strptr; + const u16 str = load_u16_unaligned(strptr); const u8 *matchptr; matchptr = strptr - recent_offsets[0]; - if (*(const u16 *)matchptr == str) + if (load_u16_unaligned(matchptr) == str) rep_max_len = lz_extend_repmatch(strptr, matchptr, max_len); else rep_max_len = 0; rep_max_idx = 0; matchptr = strptr - recent_offsets[1]; - if (*(const u16 *)matchptr == str) { + if (load_u16_unaligned(matchptr) == str) { rep_len = lz_extend_repmatch(strptr, matchptr, max_len); if (rep_len > rep_max_len) { rep_max_len = rep_len; @@ -51,7 +52,7 @@ lz_repsearch3(const u8 * const strptr, const u32 max_len, } matchptr = strptr - recent_offsets[2]; - if (*(const u16 *)matchptr == str) { + if (load_u16_unaligned(matchptr) == str) { rep_len = lz_extend_repmatch(strptr, matchptr, max_len); if (rep_len > rep_max_len) { rep_max_len = rep_len; diff --git a/include/wimlib/lzx.h b/include/wimlib/lzx.h index da0c5514..5bd6b140 100644 --- a/include/wimlib/lzx.h +++ b/include/wimlib/lzx.h @@ -8,6 +8,7 @@ #define _WIMLIB_LZX_H #include "wimlib/assert.h" +#include "wimlib/bitops.h" #include "wimlib/compiler.h" #include "wimlib/lzx_constants.h" #include "wimlib/util.h" @@ -40,7 +41,7 @@ lzx_get_offset_slot_raw(u32 adjusted_offset) return (adjusted_offset >> 17) + 34; } else { LZX_ASSERT(2 <= adjusted_offset && adjusted_offset < 655360); - unsigned mssb_idx = bsr32(adjusted_offset); + unsigned mssb_idx = fls32(adjusted_offset); return (mssb_idx << 1) | ((adjusted_offset >> (mssb_idx - 1)) & 1); } diff --git a/include/wimlib/types.h b/include/wimlib/types.h index ac67cf4a..84b55be3 100644 --- a/include/wimlib/types.h +++ b/include/wimlib/types.h @@ -47,4 +47,13 @@ typedef struct WIMStruct WIMStruct; # define WIMLIB_WIMSTRUCT_DECLARED #endif -#endif +/* + * Type of a machine word. 'unsigned long' would be logical, but that is only + * 32 bits on x86_64 Windows. The same applies to 'uint_fast32_t'. So the best + * we can do without a bunch of #ifdefs appears to be 'size_t'. 
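+ * ('size_t' is not formally guaranteed to be the native word size, but it is
+ * 4 bytes on the 32-bit and 8 bytes on the 64-bit platforms wimlib targets,
+ * which is all that the WORDSIZE-dependent code assumes.)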
+ */ +typedef size_t machine_word_t; + +#define WORDSIZE sizeof(machine_word_t) + +#endif /* _WIMLIB_TYPES_H */ diff --git a/include/wimlib/unaligned.h b/include/wimlib/unaligned.h index 7c2ade22..ccc86f01 100644 --- a/include/wimlib/unaligned.h +++ b/include/wimlib/unaligned.h @@ -10,9 +10,9 @@ #ifndef _WIMLIB_UNALIGNED_H #define _WIMLIB_UNALIGNED_H -#include "compiler.h" -#include "endianness.h" -#include "types.h" +#include "wimlib/compiler.h" +#include "wimlib/endianness.h" +#include "wimlib/types.h" #define DEFINE_UNALIGNED_TYPE(type) \ struct type##_unaligned { \ @@ -31,10 +31,99 @@ store_##type##_unaligned(type val, void *p) \ ((struct type##_unaligned *)p)->v = val; \ } +DEFINE_UNALIGNED_TYPE(u16); +DEFINE_UNALIGNED_TYPE(u32); +DEFINE_UNALIGNED_TYPE(u64); DEFINE_UNALIGNED_TYPE(le16); DEFINE_UNALIGNED_TYPE(le32); DEFINE_UNALIGNED_TYPE(le64); +DEFINE_UNALIGNED_TYPE(be16); DEFINE_UNALIGNED_TYPE(be32); +DEFINE_UNALIGNED_TYPE(be64); DEFINE_UNALIGNED_TYPE(size_t); +DEFINE_UNALIGNED_TYPE(machine_word_t); + +#define load_word_unaligned load_machine_word_t_unaligned +#define store_word_unaligned store_machine_word_t_unaligned + +static inline void +copy_word_unaligned(const void *src, void *dst) +{ + store_word_unaligned(load_word_unaligned(src), dst); +} + +static inline machine_word_t +repeat_byte(u8 b) +{ + machine_word_t v; + + BUILD_BUG_ON(WORDSIZE != 4 && WORDSIZE != 8); + + v = b; + v |= v << 8; + v |= v << 16; + v |= v << ((WORDSIZE == 8) ? 32 : 0); + return v; +} + +static inline u16 +get_unaligned_u16_le(const void *p) +{ + u16 v; + + if (UNALIGNED_ACCESS_IS_FAST) { + v = le16_to_cpu(load_le16_unaligned(p)); + } else { + const u8 *p8 = p; + v = 0; + v |= (u16)p8[0] << 0; + v |= (u16)p8[1] << 8; + } + return v; +} + +static inline u32 +get_unaligned_u32_le(const void *p) +{ + u32 v; + + if (UNALIGNED_ACCESS_IS_FAST) { + v = le32_to_cpu(load_le32_unaligned(p)); + } else { + const u8 *p8 = p; + v = 0; + v |= (u32)p8[0] << 0; + v |= (u32)p8[1] << 8; + v |= (u32)p8[2] << 16; + v |= (u32)p8[3] << 24; + } + return v; +} + +static inline void +put_unaligned_u16_le(u16 v, void *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_le16_unaligned(cpu_to_le16(v), p); + } else { + u8 *p8 = p; + p8[0] = (v >> 0) & 0xFF; + p8[1] = (v >> 8) & 0xFF; + } +} + +static inline void +put_unaligned_u32_le(u32 v, void *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_le32_unaligned(cpu_to_le32(v), p); + } else { + u8 *p8 = p; + p8[0] = (v >> 0) & 0xFF; + p8[1] = (v >> 8) & 0xFF; + p8[2] = (v >> 16) & 0xFF; + p8[3] = (v >> 24) & 0xFF; + } +} #endif /* _WIMLIB_UNALIGNED_H */ diff --git a/include/wimlib/util.h b/include/wimlib/util.h index 91bed5a4..0aa40d1b 100644 --- a/include/wimlib/util.h +++ b/include/wimlib/util.h @@ -6,20 +6,6 @@ #include -#ifndef min -#define min(a, b) ({ typeof(a) __a = (a); typeof(b) __b = (b); \ - (__a < __b) ? __a : __b; }) -#endif - -#ifndef max -#define max(a, b) ({ typeof(a) __a = (a); typeof(b) __b = (b); \ - (__a > __b) ? __a : __b; }) -#endif - -#ifndef swap -#define swap(a, b) ({typeof(a) _a = a; (a) = (b); (b) = _a;}) -#endif - /** * container_of - cast a member of a structure out to the containing structure * @ptr: the pointer to the member. @@ -46,8 +32,6 @@ /* Used for buffering FILE IO in a few places */ #define BUFFER_SIZE 32768 -#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) - /* Maximum number of array elements to allocate on the stack (used in various * places when large temporary buffers are needed). 
*/ #define STACK_MAX 32768 @@ -59,7 +43,7 @@ extern void wimlib_free_memory(void *p); extern void * -wimlib_realloc(void *, size_t) _warn_unused_result_attribute; +wimlib_realloc(void *, size_t); extern void * wimlib_calloc(size_t nmemb, size_t size) _malloc_attribute; @@ -107,22 +91,6 @@ randomize_char_array_with_alnum(tchar p[], size_t n); extern void print_byte_field(const u8 field[], size_t len, FILE *out); -static inline u32 -bsr32(u32 n) -{ -#if defined(__x86__) || defined(__x86_64__) - asm("bsrl %0, %0;" - : "=r"(n) - : "0" (n)); - return n; -#else - u32 pow = 0; - while ((n >>= 1) != 0) - pow++; - return pow; -#endif -} - static inline bool is_power_of_2(unsigned long n) { diff --git a/src/decompress_common.c b/src/decompress_common.c index 66440efe..4ab9ccbe 100644 --- a/src/decompress_common.c +++ b/src/decompress_common.c @@ -15,16 +15,16 @@ #endif #include "wimlib/decompress_common.h" -#include "wimlib/util.h" /* for BUILD_BUG_ON() */ #include +#define USE_WORD_FILL + #ifdef __GNUC__ # ifdef __SSE2__ +# undef USE_WORD_FILL # define USE_SSE2_FILL # include -# else -# define USE_LONG_FILL # endif #endif @@ -142,8 +142,8 @@ make_huffman_decode_table(u16 decode_table[const restrict], unsigned stores_per_loop; unsigned decode_table_pos; -#ifdef USE_LONG_FILL - const unsigned entries_per_long = sizeof(unsigned long) / sizeof(decode_table[0]); +#ifdef USE_WORD_FILL + const unsigned entries_per_word = WORDSIZE / sizeof(decode_table[0]); #endif #ifdef USE_SSE2_FILL @@ -230,7 +230,7 @@ make_huffman_decode_table(u16 decode_table[const restrict], * have the most entries. From there, the number of entries per * codeword will decrease. As an optimization, we may begin filling * entries with SSE2 vector accesses (8 entries/store), then change to - * 'unsigned long' accesses (2 or 4 entries/store), then change to + * 'machine_word_t' accesses (2 or 4 entries/store), then change to * 16-bit accesses (1 entry/store). */ decode_table_ptr = decode_table; sym_idx = 0; @@ -242,11 +242,11 @@ make_huffman_decode_table(u16 decode_table[const restrict], for (; stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1) { unsigned end_sym_idx = sym_idx + len_counts[codeword_len]; for (; sym_idx < end_sym_idx; sym_idx++) { - /* Note: unlike in the 'long' version below, the __m128i - * type already has __attribute__((may_alias)), so using - * it to access the decode table, which is an array of - * unsigned shorts, will not violate strict aliasing. - */ + /* Note: unlike in the machine_word_t version below, the + * __m128i type already has __attribute__((may_alias)), + * so using it to access the decode table, which is an + * array of unsigned shorts, will not violate strict + * aliasing. */ u16 entry; __m128i v; __m128i *p; @@ -265,36 +265,33 @@ make_huffman_decode_table(u16 decode_table[const restrict], } #endif /* USE_SSE2_FILL */ -#ifdef USE_LONG_FILL - /* Fill the entries one 'unsigned long' at a time. +#ifdef USE_WORD_FILL + /* Fill the entries one machine word at a time. * On 32-bit systems this is 2 entries per store, while on 64-bit * systems this is 4 entries per store. 
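 * Each 16-bit entry is first replicated across the word (v |= v << 16, then
 * v |= v << 32 on 64-bit); for example, a codeword 2 bits shorter than
 * 'table_bits' owns 4 consecutive entries, i.e. one 64-bit or two 32-bit stores.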
*/ - stores_per_loop = (1 << (table_bits - codeword_len)) / entries_per_long; + stores_per_loop = (1 << (table_bits - codeword_len)) / entries_per_word; for (; stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1) { unsigned end_sym_idx = sym_idx + len_counts[codeword_len]; for (; sym_idx < end_sym_idx; sym_idx++) { - /* Accessing the array of unsigned shorts as unsigned - * longs would violate strict aliasing and would require - * compiling the code with -fno-strict-aliasing to - * guarantee correctness. To work around this problem, - * use the gcc 'may_alias' extension to define a special - * unsigned long type that may alias any other in-memory - * variable. */ - typedef unsigned long __attribute__((may_alias)) aliased_long_t; - - unsigned long v; - aliased_long_t *p; + /* Accessing the array of u16 as u32 or u64 would + * violate strict aliasing and would require compiling + * the code with -fno-strict-aliasing to guarantee + * correctness. To work around this problem, use the + * gcc 'may_alias' extension. */ + typedef machine_word_t _may_alias_attribute aliased_word_t; + + machine_word_t v; + aliased_word_t *p; unsigned n; - BUILD_BUG_ON(sizeof(unsigned long) != 4 && - sizeof(unsigned long) != 8); + BUILD_BUG_ON(WORDSIZE != 4 && WORDSIZE != 8); v = MAKE_DIRECT_ENTRY(sorted_syms[sym_idx], codeword_len); v |= v << 16; - v |= v << (sizeof(unsigned long) == 8 ? 32 : 0); + v |= v << (WORDSIZE == 8 ? 32 : 0); - p = (aliased_long_t *)decode_table_ptr; + p = (aliased_word_t *)decode_table_ptr; n = stores_per_loop; do { @@ -303,7 +300,7 @@ make_huffman_decode_table(u16 decode_table[const restrict], decode_table_ptr = p; } } -#endif /* USE_LONG_FILL */ +#endif /* USE_WORD_FILL */ /* Fill the entries one 16-bit integer at a time. */ stores_per_loop = (1 << (table_bits - codeword_len)); diff --git a/src/lz_hash_chains.c b/src/lz_hash_chains.c index 7e3e10b8..c116f125 100644 --- a/src/lz_hash_chains.c +++ b/src/lz_hash_chains.c @@ -138,6 +138,7 @@ lz_hc_get_matches(struct lz_mf *_mf, struct lz_match matches[]) struct lz_match *lz_matchptr = matches; u32 hash; u32 cur_match; + u32 sequence; if (unlikely(bytes_remaining < LZ_HASH_REQUIRED_NBYTES + 1)) return 0; @@ -159,6 +160,9 @@ lz_hc_get_matches(struct lz_mf *_mf, struct lz_match matches[]) if (unlikely(best_len >= max_len)) return 0; + if (UNALIGNED_ACCESS_IS_FAST) + sequence = load_u24_unaligned(strptr); + /* Search the appropriate hash chain for matches. */ for (; cur_match && depth_remaining--; cur_match = prev_tab[cur_match]) { @@ -173,60 +177,60 @@ lz_hc_get_matches(struct lz_mf *_mf, struct lz_match matches[]) if (matchptr[best_len] != strptr[best_len]) goto next_match; - #if HAVE_FAST_LZ_EXTEND - if ((*(const u32 *)strptr & 0xFFFFFF) != - (*(const u32 *)matchptr & 0xFFFFFF)) - goto next_match; + if (UNALIGNED_ACCESS_IS_FAST) { + if (load_u24_unaligned(matchptr) != sequence) + goto next_match; + + len = lz_extend(strptr, matchptr, 3, max_len); + + if (len > best_len) { + best_len = len; + + *lz_matchptr++ = (struct lz_match) { + .len = best_len, + .offset = strptr - matchptr, + }; + + if (best_len >= nice_len) + break; + } + } else { + + /* The bytes at indices 'best_len - 1' and '0' are less + * important to check separately. But doing so still + * gives a slight performance improvement, at least on + * x86_64, probably because they create separate + * branches for the CPU to predict independently of the + * branches in the main comparison loops. 
+ */ + if (matchptr[best_len - 1] != strptr[best_len - 1] || + matchptr[0] != strptr[0]) + goto next_match; + + for (len = 1; len < best_len - 1; len++) + if (matchptr[len] != strptr[len]) + goto next_match; - len = lz_extend(strptr, matchptr, 3, max_len); + /* The match is the longest found so far --- at least + * 'best_len' + 1 bytes. Continue extending it. */ - if (len > best_len) { - best_len = len; + if (++best_len != max_len && + strptr[best_len] == matchptr[best_len]) + while (++best_len != max_len) + if (strptr[best_len] != matchptr[best_len]) + break; + /* Record the match. */ *lz_matchptr++ = (struct lz_match) { .len = best_len, .offset = strptr - matchptr, }; + /* Terminate the search if 'nice_len' was reached. */ if (best_len >= nice_len) break; } - #else /* HAVE_FAST_LZ_EXTEND */ - - /* The bytes at indices 'best_len - 1' and '0' are less - * important to check separately. But doing so still gives a - * slight performance improvement, at least on x86_64, probably - * because they create separate branches for the CPU to predict - * independently of the branches in the main comparison loops. - */ - if (matchptr[best_len - 1] != strptr[best_len - 1] || - matchptr[0] != strptr[0]) - goto next_match; - - for (len = 1; len < best_len - 1; len++) - if (matchptr[len] != strptr[len]) - goto next_match; - - /* The match is the longest found so far --- - * at least 'best_len' + 1 bytes. Continue extending it. */ - - if (++best_len != max_len && strptr[best_len] == matchptr[best_len]) - while (++best_len != max_len) - if (strptr[best_len] != matchptr[best_len]) - break; - - /* Record the match. */ - *lz_matchptr++ = (struct lz_match) { - .len = best_len, - .offset = strptr - matchptr, - }; - - /* Terminate the search if 'nice_len' was reached. */ - if (best_len >= nice_len) - break; - #endif /* !HAVE_FAST_LZ_EXTEND */ - next_match: /* Continue to next match in the chain. */ ; diff --git a/src/lzms-common.c b/src/lzms-common.c index 95682d53..41383f4c 100644 --- a/src/lzms-common.c +++ b/src/lzms-common.c @@ -26,6 +26,7 @@ # include "config.h" #endif +#include "wimlib/bitops.h" #include "wimlib/endianness.h" #include "wimlib/lzms.h" #include "wimlib/unaligned.h" @@ -95,7 +96,7 @@ lzms_decode_delta_rle_slot_bases(u32 slot_bases[], LZMS_ASSERT(slot == expected_num_slots); slot_bases[slot] = final; - extra_bits[slot - 1] = bsr32(slot_bases[slot] - slot_bases[slot - 1]); + extra_bits[slot - 1] = fls32(slot_bases[slot] - slot_bases[slot - 1]); } /* Initialize the global offset and length slot tables. 
*/ @@ -159,19 +160,19 @@ lzms_maybe_do_x86_translation(u8 data[restrict], s32 i, s32 num_op_bytes, LZMS_DEBUG("Undid x86 translation at position %d " "(opcode 0x%02x)", i, data[i]); void *p32 = &data[i + num_op_bytes]; - u32 n = le32_to_cpu(load_le32_unaligned(p32)); - store_le32_unaligned(cpu_to_le32(n - i), p32); + u32 n = get_unaligned_u32_le(p32); + put_unaligned_u32_le(n - i, p32); } - pos = i + le16_to_cpu(load_le16_unaligned(&data[i + num_op_bytes])); + pos = i + get_unaligned_u16_le(&data[i + num_op_bytes]); } else { - pos = i + le16_to_cpu(load_le16_unaligned(&data[i + num_op_bytes])); + pos = i + get_unaligned_u16_le(&data[i + num_op_bytes]); if (i - *closest_target_usage_p <= max_trans_offset) { LZMS_DEBUG("Did x86 translation at position %d " "(opcode 0x%02x)", i, data[i]); void *p32 = &data[i + num_op_bytes]; - u32 n = le32_to_cpu(load_le32_unaligned(p32)); - store_le32_unaligned(cpu_to_le32(n + i), p32); + u32 n = get_unaligned_u32_le(p32); + put_unaligned_u32_le(n + i, p32); } } diff --git a/src/lzms-compress.c b/src/lzms-compress.c index da1904fc..3686db0a 100644 --- a/src/lzms-compress.c +++ b/src/lzms-compress.c @@ -32,6 +32,7 @@ #include "wimlib/lz_mf.h" #include "wimlib/lz_repsearch.h" #include "wimlib/lzms.h" +#include "wimlib/unaligned.h" #include "wimlib/util.h" #include @@ -338,7 +339,7 @@ lzms_output_bitstream_put_varbits(struct lzms_output_bitstream *os, /* Write a coding unit, unless it would underflow the buffer. */ if (os->next != os->begin) - *--os->next = cpu_to_le16(os->bitbuf >> os->bitcount); + put_unaligned_u16_le(os->bitbuf >> os->bitcount, --os->next); /* Optimization for call sites that never write more than 16 * bits at once. */ @@ -357,7 +358,7 @@ lzms_output_bitstream_flush(struct lzms_output_bitstream *os) return false; if (os->bitcount != 0) - *--os->next = cpu_to_le16(os->bitbuf << (16 - os->bitcount)); + put_unaligned_u16_le(os->bitbuf << (16 - os->bitcount), --os->next); return true; } @@ -401,9 +402,11 @@ lzms_range_encoder_raw_shift_low(struct lzms_range_encoder_raw *rc) * ((rc->low >> 32) != 0, a.k.a. the carry bit is 1). 
*/ do { if (likely(rc->next >= rc->begin)) { - if (rc->next != rc->end) - *rc->next++ = cpu_to_le16(rc->cache + - (u16)(rc->low >> 32)); + if (rc->next != rc->end) { + put_unaligned_u16_le(rc->cache + + (u16)(rc->low >> 32), + rc->next++); + } } else { rc->next++; } diff --git a/src/lzms-decompress.c b/src/lzms-decompress.c index 43cd9acb..96c501aa 100644 --- a/src/lzms-decompress.c +++ b/src/lzms-decompress.c @@ -209,6 +209,7 @@ #include "wimlib/decompress_common.h" #include "wimlib/error.h" #include "wimlib/lzms.h" +#include "wimlib/unaligned.h" #include "wimlib/util.h" #include @@ -393,7 +394,7 @@ lzms_input_bitstream_ensure_bits(struct lzms_input_bitstream *is, if (unlikely(is->num_le16_remaining == 0)) return -1; - next = le16_to_cpu(*--is->in); + next = get_unaligned_u16_le(--is->in); is->num_le16_remaining--; is->bitbuf |= next << (sizeof(is->bitbuf) * 8 - is->num_filled_bits - 16); @@ -450,8 +451,8 @@ lzms_range_decoder_raw_init(struct lzms_range_decoder_raw *rd, const le16 *in, size_t in_limit) { rd->range = 0xffffffff; - rd->code = ((u32)le16_to_cpu(in[0]) << 16) | - ((u32)le16_to_cpu(in[1]) << 0); + rd->code = ((u32)get_unaligned_u16_le(&in[0]) << 16) | + ((u32)get_unaligned_u16_le(&in[1]) << 0); rd->in = in + 2; rd->num_le16_remaining = in_limit - 2; } @@ -465,7 +466,7 @@ lzms_range_decoder_raw_normalize(struct lzms_range_decoder_raw *rd) rd->range <<= 16; if (unlikely(rd->num_le16_remaining == 0)) return -1; - rd->code = (rd->code << 16) | le16_to_cpu(*rd->in++); + rd->code = (rd->code << 16) | get_unaligned_u16_le(rd->in++); rd->num_le16_remaining--; } return 0; @@ -667,7 +668,7 @@ lzms_copy_lz_match(struct lzms_decompressor *ctx, u32 length, u32 offset) out_next = ctx->out_next; - lz_copy(out_next, length, offset, ctx->out_end); + lz_copy(out_next, length, offset, ctx->out_end, 1); ctx->out_next = out_next + length; return 0; diff --git a/src/lzx-common.c b/src/lzx-common.c index 9f55f171..1221f61e 100644 --- a/src/lzx-common.c +++ b/src/lzx-common.c @@ -23,6 +23,7 @@ # include "config.h" #endif +#include "wimlib/bitops.h" #include "wimlib/endianness.h" #include "wimlib/lzx.h" #include "wimlib/unaligned.h" @@ -75,7 +76,7 @@ lzx_get_window_order(size_t max_block_size) if (max_block_size == 0 || max_block_size > LZX_MAX_WINDOW_SIZE) return 0; - order = bsr32(max_block_size); + order = fls32(max_block_size); if (((u32)1 << order) != max_block_size) order++; @@ -125,7 +126,7 @@ do_translate_target(void *target, s32 input_pos) { s32 abs_offset, rel_offset; - rel_offset = le32_to_cpu(load_le32_unaligned(target)); + rel_offset = get_unaligned_u32_le(target); if (rel_offset >= -input_pos && rel_offset < LZX_WIM_MAGIC_FILESIZE) { if (rel_offset < LZX_WIM_MAGIC_FILESIZE - input_pos) { /* "good translation" */ @@ -134,7 +135,7 @@ do_translate_target(void *target, s32 input_pos) /* "compensating translation" */ abs_offset = rel_offset - LZX_WIM_MAGIC_FILESIZE; } - store_le32_unaligned(cpu_to_le32(abs_offset), target); + put_unaligned_u32_le(abs_offset, target); } } @@ -143,20 +144,18 @@ undo_translate_target(void *target, s32 input_pos) { s32 abs_offset, rel_offset; - abs_offset = le32_to_cpu(load_le32_unaligned(target)); + abs_offset = get_unaligned_u32_le(target); if (abs_offset >= 0) { if (abs_offset < LZX_WIM_MAGIC_FILESIZE) { /* "good translation" */ rel_offset = abs_offset - input_pos; - - store_le32_unaligned(cpu_to_le32(rel_offset), target); + put_unaligned_u32_le(rel_offset, target); } } else { if (abs_offset >= -input_pos) { /* "compensating translation" */ rel_offset = 
abs_offset + LZX_WIM_MAGIC_FILESIZE; - - store_le32_unaligned(cpu_to_le32(rel_offset), target); + put_unaligned_u32_le(rel_offset, target); } } } @@ -195,6 +194,7 @@ inline /* Although inlining the 'process_target' function still speeds up the void lzx_e8_filter(u8 *data, u32 size, void (*process_target)(void *, s32)) { + u8 *p = data; #ifdef __SSE2__ /* SSE2 vectorized implementation for x86_64. This speeds up LZX * decompression by about 5-8% overall. (Usually --- the performance @@ -202,11 +202,11 @@ lzx_e8_filter(u8 *data, u32 size, void (*process_target)(void *, s32)) * consists entirely of 0xe8 bytes. Also, this optimization affects * compression as well, but the percentage improvement is less because * LZX compression is much slower than LZX decompression. ) */ - __m128i *p128 = (__m128i *)data; - u32 valid_mask = 0xFFFFFFFF; + if (size >= 32 && (uintptr_t)p % 16 == 0) { + + u32 valid_mask = 0xFFFFFFFF; - if (size >= 32 && (uintptr_t)data % 16 == 0) { - __m128i * const end128 = p128 + size / 16 - 1; + u8 * const vec_end = p + (size & ~15) - 16; /* Create a vector of all 0xe8 bytes */ const __m128i e8_bytes = _mm_set1_epi8(0xe8); @@ -216,7 +216,8 @@ lzx_e8_filter(u8 *data, u32 size, void (*process_target)(void *, s32)) /* Compare the current 16-byte vector with the vector of * all 0xe8 bytes. This produces 0xff where the byte is * 0xe8 and 0x00 where it is not. */ - __m128i cmpresult = _mm_cmpeq_epi8(*p128, e8_bytes); + __m128i cmpresult = _mm_cmpeq_epi8(*(const __m128i *)p, + e8_bytes); /* Map the comparison results into a single 16-bit * number. It will contain a 1 bit when the @@ -244,12 +245,11 @@ lzx_e8_filter(u8 *data, u32 size, void (*process_target)(void *, s32)) * index of the byte, within the 16, at * which the next e8 translation should * be done. */ - u32 bit = __builtin_ctz(e8_mask); + int bit = ffs32(e8_mask); /* Do (or undo) the e8 translation. */ - u8 *p8 = (u8 *)p128 + bit; - (*process_target)(p8 + 1, - p8 - data); + (*process_target)(p + bit + 1, + p + bit - data); /* Don't start an e8 translation in the * next 4 bytes. */ @@ -260,30 +260,27 @@ lzx_e8_filter(u8 *data, u32 size, void (*process_target)(void *, s32)) valid_mask >>= 16; valid_mask |= 0xFFFF0000; } - } while (++p128 < end128); - } + } while ((p += 16) < vec_end); - u8 *p8 = (u8 *)p128; - while (!(valid_mask & 1)) { - p8++; - valid_mask >>= 1; + while (!(valid_mask & 1)) { + p++; + valid_mask >>= 1; + } } -#else /* __SSE2__ */ - u8 *p8 = data; #endif /* !__SSE2__ */ if (size > 10) { /* Finish any bytes that weren't processed by the vectorized * implementation. */ - u8 *p8_end = data + size - 10; + u8 *end = data + size - 10; do { - if (*p8 == 0xe8) { - (*process_target)(p8 + 1, p8 - data); - p8 += 5; + if (*p == 0xe8) { + (*process_target)(p + 1, p - data); + p += 5; } else { - p8++; + p++; } - } while (p8 < p8_end); + } while (p < end); } } diff --git a/src/lzx-compress.c b/src/lzx-compress.c index 2cb35e3a..201a1cfb 100644 --- a/src/lzx-compress.c +++ b/src/lzx-compress.c @@ -357,7 +357,7 @@ lzx_write_varbits(struct lzx_output_bitstream *os, /* Write a coding unit, unless it would overflow the buffer. */ if (os->next != os->end) - *os->next++ = cpu_to_le16(os->bitbuf >> os->bitcount); + put_unaligned_u16_le(os->bitbuf >> os->bitcount, os->next++); /* If writing 17 bits, a second coding unit might need to be * written. But because 'max_num_bits' is a compile-time @@ -365,7 +365,7 @@ lzx_write_varbits(struct lzx_output_bitstream *os, * call sites. 
*/ if (max_num_bits == 17 && os->bitcount == 16) { if (os->next != os->end) - *os->next++ = cpu_to_le16(os->bitbuf); + put_unaligned_u16_le(os->bitbuf, os->next++); os->bitcount = 0; } } @@ -391,7 +391,7 @@ lzx_flush_output(struct lzx_output_bitstream *os) return 0; if (os->bitcount != 0) - *os->next++ = cpu_to_le16(os->bitbuf << (16 - os->bitcount)); + put_unaligned_u16_le(os->bitbuf << (16 - os->bitcount), os->next++); return (const u8 *)os->next - (const u8 *)os->start; } diff --git a/src/lzx-decompress.c b/src/lzx-decompress.c index 4c7694b6..2c466512 100644 --- a/src/lzx-decompress.c +++ b/src/lzx-decompress.c @@ -500,7 +500,8 @@ lzx_decompress_block(int block_type, u32 block_size, if (unlikely(match_offset > window_ptr - window)) return -1; - lz_copy(window_ptr, match_len, match_offset, window_end); + lz_copy(window_ptr, match_len, match_offset, window_end, + LZX_MIN_MATCH_LEN); window_ptr += match_len; } diff --git a/src/resource.c b/src/resource.c index fb5761d9..20e27e17 100644 --- a/src/resource.c +++ b/src/resource.c @@ -26,6 +26,7 @@ #endif #include "wimlib/assert.h" +#include "wimlib/bitops.h" #include "wimlib/endianness.h" #include "wimlib/error.h" #include "wimlib/file_io.h" @@ -214,7 +215,7 @@ read_compressed_wim_resource(const struct wim_resource_spec * const rspec, } } - const u32 chunk_order = bsr32(chunk_size); + const u32 chunk_order = fls32(chunk_size); /* Calculate the total number of chunks the resource is divided into. */ const u64 num_chunks = (rspec->uncompressed_size + chunk_size - 1) >> chunk_order; @@ -326,8 +327,8 @@ read_compressed_wim_resource(const struct wim_resource_spec * const rspec, /* Now fill in chunk_offsets from the entries we have read in * chunk_tab_data. We break aliasing rules here to avoid having * to allocate yet another array. */ - typedef le64 __attribute__((may_alias)) aliased_le64_t; - typedef le32 __attribute__((may_alias)) aliased_le32_t; + typedef le64 _may_alias_attribute aliased_le64_t; + typedef le32 _may_alias_attribute aliased_le32_t; u64 * chunk_offsets_p = chunk_offsets; if (alt_chunk_table) { diff --git a/src/util.c b/src/util.c index 6e9ae23e..3b13c503 100644 --- a/src/util.c +++ b/src/util.c @@ -23,12 +23,15 @@ # include "config.h" #endif +/* Make sure the POSIX-compatible strerror_r() is declared, rather than the GNU + * version, which has a different return type. */ #ifdef _GNU_SOURCE # define _GNU_SOURCE_DEFINED 1 # undef _GNU_SOURCE +# ifndef _POSIX_C_SOURCE +# define _POSIX_C_SOURCE 200112L +# endif #endif -/* Make sure the POSIX-compatible strerror_r() is declared, rather than the GNU - * version, which has a different return type. */ #include #ifdef _GNU_SOURCE_DEFINED # define _GNU_SOURCE diff --git a/src/wim.c b/src/wim.c index 8422d9e1..51fc1f04 100644 --- a/src/wim.c +++ b/src/wim.c @@ -24,6 +24,7 @@ #endif #include "wimlib.h" +#include "wimlib/bitops.h" #include "wimlib/dentry.h" #include "wimlib/encoding.h" #include "wimlib/file_io.h" @@ -96,7 +97,7 @@ wim_chunk_size_valid(u32 chunk_size, int ctype) /* Chunk size must be power of 2. 
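 * For example, fls32(0x8000) is 15 and 1U << 15 == 0x8000, so 32768 passes
 * this check, while fls32(0x9000) is also 15 but 1U << 15 != 0x9000, so any
 * non-power-of-2 size is rejected.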
*/ if (chunk_size == 0) return false; - order = bsr32(chunk_size); + order = fls32(chunk_size); if (chunk_size != 1U << order) return false; diff --git a/src/write.c b/src/write.c index 908de9ad..d8ce2dc3 100644 --- a/src/write.c +++ b/src/write.c @@ -517,8 +517,8 @@ end_chunk_table(struct write_streams_ctx *ctx, u64 res_actual_size, 0 != (ctx->write_resource_flags & WRITE_RESOURCE_FLAG_PACK_STREAMS)); - typedef le64 __attribute__((may_alias)) aliased_le64_t; - typedef le32 __attribute__((may_alias)) aliased_le32_t; + typedef le64 _may_alias_attribute aliased_le64_t; + typedef le32 _may_alias_attribute aliased_le32_t; if (chunk_entry_size == 4) { aliased_le32_t *entries = (aliased_le32_t*)ctx->chunk_csizes; diff --git a/src/xpress-compress.c b/src/xpress-compress.c index d59de36a..fdcc2fca 100644 --- a/src/xpress-compress.c +++ b/src/xpress-compress.c @@ -26,11 +26,13 @@ # include "config.h" #endif +#include "wimlib/bitops.h" #include "wimlib/compress_common.h" #include "wimlib/compressor_ops.h" #include "wimlib/endianness.h" #include "wimlib/error.h" #include "wimlib/lz_mf.h" +#include "wimlib/unaligned.h" #include "wimlib/util.h" #include "wimlib/xpress.h" @@ -216,7 +218,7 @@ xpress_write_bits(struct xpress_output_bitstream *os, if (os->bitcount > 16) { os->bitcount -= 16; if (os->end - os->next_byte >= 2) { - *(le16 *)os->next_bits = cpu_to_le16(os->bitbuf >> os->bitcount); + put_unaligned_u16_le(os->bitbuf >> os->bitcount, os->next_bits); os->next_bits = os->next_bits2; os->next_bits2 = os->next_byte; os->next_byte += 2; @@ -244,8 +246,8 @@ xpress_flush_output(struct xpress_output_bitstream *os) if (unlikely(os->end - os->next_byte < 2)) return 0; - *(le16 *)os->next_bits = cpu_to_le16(os->bitbuf << (16 - os->bitcount)); - *(le16 *)os->next_bits2 = cpu_to_le16(0); + put_unaligned_u16_le(os->bitbuf << (16 - os->bitcount), os->next_bits); + put_unaligned_u16_le(0, os->next_bits2); return os->next_byte - os->start; } @@ -332,8 +334,8 @@ xpress_declare_match(struct xpress_compressor *c, { unsigned adjusted_len = len - XPRESS_MIN_MATCH_LEN; unsigned len_hdr = min(adjusted_len, 0xf); - unsigned offset_bsr = bsr32(offset); - unsigned sym = XPRESS_NUM_CHARS + ((offset_bsr << 4) | len_hdr); + unsigned offset_high_bit = fls32(offset); + unsigned sym = XPRESS_NUM_CHARS + ((offset_high_bit << 4) | len_hdr); c->freqs[sym]++; @@ -341,8 +343,8 @@ xpress_declare_match(struct xpress_compressor *c, *(*next_chosen_item)++ = (struct xpress_item) { .data = (u64)sym | ((u64)adjusted_len << 9) | - ((u64)offset_bsr << 25) | - ((u64)(offset ^ (1U << offset_bsr)) << 29), + ((u64)offset_high_bit << 25) | + ((u64)(offset ^ (1U << offset_high_bit)) << 29), }; } } @@ -605,7 +607,7 @@ xpress_consider_matches(struct xpress_compressor *c, u32 cost; u32 position_cost; unsigned offset; - unsigned offset_bsr; + unsigned offset_high_bit; unsigned adjusted_len; unsigned len_hdr; unsigned sym; @@ -614,11 +616,11 @@ xpress_consider_matches(struct xpress_compressor *c, /* All lengths are small. Optimize accordingly. 
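 * ('Small' here means no candidate match length reaches
 * XPRESS_MIN_MATCH_LEN + 0xf, so 'len_hdr' below never needs the
 * min(adjusted_len, 0xf) cap that the general case further down applies.)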
*/ do { offset = matches[i].offset; - offset_bsr = bsr32(offset); + offset_high_bit = fls32(offset); len_hdr = len - XPRESS_MIN_MATCH_LEN; - sym = XPRESS_NUM_CHARS + ((offset_bsr << 4) | len_hdr); + sym = XPRESS_NUM_CHARS + ((offset_high_bit << 4) | len_hdr); - position_cost = cur_optimum_ptr->cost + offset_bsr; + position_cost = cur_optimum_ptr->cost + offset_high_bit; do { cost = position_cost + c->costs[sym]; if (cost < (cur_optimum_ptr + len)->cost) { @@ -633,12 +635,12 @@ xpress_consider_matches(struct xpress_compressor *c, /* Some lengths are big. */ do { offset = matches[i].offset; - offset_bsr = bsr32(offset); - position_cost = cur_optimum_ptr->cost + offset_bsr; + offset_high_bit = fls32(offset); + position_cost = cur_optimum_ptr->cost + offset_high_bit; do { adjusted_len = len - XPRESS_MIN_MATCH_LEN; len_hdr = min(adjusted_len, 0xf); - sym = XPRESS_NUM_CHARS + ((offset_bsr << 4) | len_hdr); + sym = XPRESS_NUM_CHARS + ((offset_high_bit << 4) | len_hdr); cost = position_cost + c->costs[sym]; if (adjusted_len >= 0xf) { diff --git a/src/xpress-decompress.c b/src/xpress-decompress.c index 123613d6..b43e5273 100644 --- a/src/xpress-decompress.c +++ b/src/xpress-decompress.c @@ -84,7 +84,7 @@ xpress_decode_window(struct input_bitstream *istream, const u16 *decode_table, u8 *window_end = &window[window_size]; unsigned sym; unsigned match_len; - unsigned offset_bsr; + unsigned offset_high_bit; unsigned match_offset; while (window_ptr != window_end) { @@ -99,12 +99,12 @@ xpress_decode_window(struct input_bitstream *istream, const u16 *decode_table, /* Match */ match_len = sym & 0xf; - offset_bsr = (sym >> 4) & 0xf; + offset_high_bit = (sym >> 4) & 0xf; bitstream_ensure_bits(istream, 16); - match_offset = (1 << offset_bsr) | - bitstream_pop_bits(istream, offset_bsr); + match_offset = (1 << offset_high_bit) | + bitstream_pop_bits(istream, offset_high_bit); if (match_len == 0xf) { match_len += bitstream_read_byte(istream); @@ -119,7 +119,8 @@ xpress_decode_window(struct input_bitstream *istream, const u16 *decode_table, if (unlikely(match_len > window_end - window_ptr)) return -1; - lz_copy(window_ptr, match_len, match_offset, window_end); + lz_copy(window_ptr, match_len, match_offset, window_end, + XPRESS_MIN_MATCH_LEN); window_ptr += match_len; } -- 2.43.0