From: Eric Biggers <ebiggers3@gmail.com>
Date: Sat, 9 Jul 2016 15:01:26 +0000 (-0500)
Subject: Stop force-inlining everything marked 'inline'
X-Git-Tag: v1.10.0~22
X-Git-Url: https://wimlib.net/git/?p=wimlib;a=commitdiff_plain;h=4a20aae0dd8469a352517a0b107416ffa99ccc55

Stop force-inlining everything marked 'inline'

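Instead, define a separate 'forceinline' macro (inline plus the
always_inline attribute) and replace 'inline' with 'forceinline' in selected
places.  Plain 'inline' is no longer redefined to force inlining.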
---

diff --git a/include/wimlib/bitops.h b/include/wimlib/bitops.h
index 2c905057..1fc30f6c 100644
--- a/include/wimlib/bitops.h
+++ b/include/wimlib/bitops.h
@@ -30,7 +30,7 @@
  * input value must be nonzero!
  */
 
-static inline unsigned
+static forceinline unsigned
 bsr32(u32 v)
 {
 #ifdef compiler_bsr32
@@ -43,7 +43,7 @@ bsr32(u32 v)
 #endif
 }
 
-static inline unsigned
+static forceinline unsigned
 bsr64(u64 v)
 {
 #ifdef compiler_bsr64
@@ -56,7 +56,7 @@ bsr64(u64 v)
 #endif
 }
 
-static inline unsigned
+static forceinline unsigned
 bsrw(machine_word_t v)
 {
 	STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
@@ -72,7 +72,7 @@ bsrw(machine_word_t v)
  * input value must be nonzero!
  */
 
-static inline unsigned
+static forceinline unsigned
 bsf32(u32 v)
 {
 #ifdef compiler_bsf32
@@ -85,7 +85,7 @@ bsf32(u32 v)
 #endif
 }
 
-static inline unsigned
+static forceinline unsigned
 bsf64(u64 v)
 {
 #ifdef compiler_bsf64
@@ -98,7 +98,7 @@ bsf64(u64 v)
 #endif
 }
 
-static inline unsigned
+static forceinline unsigned
 bsfw(machine_word_t v)
 {
 	STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
@@ -109,7 +109,7 @@ bsfw(machine_word_t v)
 }
 
 /* Return the log base 2 of 'n', rounded up to the nearest integer. */
-static inline unsigned
+static forceinline unsigned
 ilog2_ceil(size_t n)
 {
         if (n <= 1)
@@ -118,7 +118,7 @@ ilog2_ceil(size_t n)
 }
 
 /* Round 'n' up to the nearest power of 2 */
-static inline size_t
+static forceinline size_t
 roundup_pow_of_2(size_t n)
 {
 	return (size_t)1 << ilog2_ceil(n);
diff --git a/include/wimlib/bt_matchfinder.h b/include/wimlib/bt_matchfinder.h
index 39a2778b..05bd07d9 100644
--- a/include/wimlib/bt_matchfinder.h
+++ b/include/wimlib/bt_matchfinder.h
@@ -108,7 +108,7 @@ struct TEMPLATED(bt_matchfinder) {
 
 /* Return the number of bytes that must be allocated for a 'bt_matchfinder' that
  * can work with buffers up to the specified size.  */
-static inline size_t
+static forceinline size_t
 TEMPLATED(bt_matchfinder_size)(size_t max_bufsize)
 {
 	return sizeof(struct TEMPLATED(bt_matchfinder)) +
@@ -116,19 +116,19 @@ TEMPLATED(bt_matchfinder_size)(size_t max_bufsize)
 }
 
 /* Prepare the matchfinder for a new input buffer.  */
-static inline void
+static forceinline void
 TEMPLATED(bt_matchfinder_init)(struct TEMPLATED(bt_matchfinder) *mf)
 {
 	memset(mf, 0, sizeof(*mf));
 }
 
-static inline mf_pos_t *
+static forceinline mf_pos_t *
 TEMPLATED(bt_left_child)(struct TEMPLATED(bt_matchfinder) *mf, u32 node)
 {
 	return &mf->child_tab[(node << 1) + 0];
 }
 
-static inline mf_pos_t *
+static forceinline mf_pos_t *
 TEMPLATED(bt_right_child)(struct TEMPLATED(bt_matchfinder) *mf, u32 node)
 {
 	return &mf->child_tab[(node << 1) + 1];
@@ -141,7 +141,7 @@ TEMPLATED(bt_right_child)(struct TEMPLATED(bt_matchfinder) *mf, u32 node)
 
 /* Advance the binary tree matchfinder by one byte, optionally recording
  * matches.  @record_matches should be a compile-time constant.  */
-static inline struct lz_match *
+static forceinline struct lz_match *
 TEMPLATED(bt_matchfinder_advance_one_byte)(struct TEMPLATED(bt_matchfinder) * const restrict mf,
 					   const u8 * const restrict in_begin,
 					   const ptrdiff_t cur_pos,
@@ -323,7 +323,7 @@ TEMPLATED(bt_matchfinder_advance_one_byte)(struct TEMPLATED(bt_matchfinder) * co
  * The return value is a pointer to the next available slot in the @lz_matchptr
  * array.  (If no matches were found, this will be the same as @lz_matchptr.)
  */
-static inline struct lz_match *
+static forceinline struct lz_match *
 TEMPLATED(bt_matchfinder_get_matches)(struct TEMPLATED(bt_matchfinder) *mf,
 				      const u8 *in_begin,
 				      ptrdiff_t cur_pos,
@@ -352,7 +352,7 @@ TEMPLATED(bt_matchfinder_get_matches)(struct TEMPLATED(bt_matchfinder) *mf,
  * This is very similar to bt_matchfinder_get_matches() because both functions
  * must do hashing and tree re-rooting.
  */
-static inline void
+static forceinline void
 TEMPLATED(bt_matchfinder_skip_position)(struct TEMPLATED(bt_matchfinder) *mf,
 					const u8 *in_begin,
 					ptrdiff_t cur_pos,
diff --git a/include/wimlib/compiler.h b/include/wimlib/compiler.h
index 6bce5d01..2b1923c2 100644
--- a/include/wimlib/compiler.h
+++ b/include/wimlib/compiler.h
@@ -50,10 +50,9 @@
 #  define WIMLIBAPI __attribute__((visibility("default")))
 #endif
 
-/* Declare that the annotated function should be inlined.  Currently, we force
- * the compiler to honor this because we use 'inline' in highly tuned code, e.g.
- * compression codecs.  */
-#define inline			inline __attribute__((always_inline))
+/* Declare that the annotated function should always be inlined.  This might be
+ * desirable in highly tuned code, e.g. compression codecs.  */
+#define forceinline		inline __attribute__((always_inline))
 
 /* Declare that the annotated function should *not* be inlined.  */
 #define noinline		__attribute__((noinline))
diff --git a/include/wimlib/decompress_common.h b/include/wimlib/decompress_common.h
index d20085db..65eb2e4b 100644
--- a/include/wimlib/decompress_common.h
+++ b/include/wimlib/decompress_common.h
@@ -54,7 +54,7 @@ struct input_bitstream {
 };
 
 /* Initialize a bitstream to read from the specified input buffer.  */
-static inline void
+static forceinline void
 init_input_bitstream(struct input_bitstream *is, const void *buffer, u32 size)
 {
 	is->bitbuf = 0;
@@ -73,7 +73,7 @@ init_input_bitstream(struct input_bitstream *is, const void *buffer, u32 size)
 /* Ensure the bit buffer variable for the bitstream contains at least @num_bits
  * bits.  Following this, bitstream_peek_bits() and/or bitstream_remove_bits()
  * may be called on the bitstream to peek or remove up to @num_bits bits.  */
-static inline void
+static forceinline void
 bitstream_ensure_bits(struct input_bitstream *is, const unsigned num_bits)
 {
 	/* This currently works for at most 17 bits.  */
@@ -106,7 +106,7 @@ overflow:
 /* Return the next @num_bits bits from the bitstream, without removing them.
  * There must be at least @num_bits remaining in the buffer variable, from a
  * previous call to bitstream_ensure_bits().  */
-static inline u32
+static forceinline u32
 bitstream_peek_bits(const struct input_bitstream *is, const unsigned num_bits)
 {
 	return (is->bitbuf >> 1) >> (sizeof(is->bitbuf) * 8 - num_bits - 1);
@@ -115,7 +115,7 @@ bitstream_peek_bits(const struct input_bitstream *is, const unsigned num_bits)
 /* Remove @num_bits from the bitstream.  There must be at least @num_bits
  * remaining in the buffer variable, from a previous call to
  * bitstream_ensure_bits().  */
-static inline void
+static forceinline void
 bitstream_remove_bits(struct input_bitstream *is, unsigned num_bits)
 {
 	is->bitbuf <<= num_bits;
@@ -125,7 +125,7 @@ bitstream_remove_bits(struct input_bitstream *is, unsigned num_bits)
 /* Remove and return @num_bits bits from the bitstream.  There must be at least
  * @num_bits remaining in the buffer variable, from a previous call to
  * bitstream_ensure_bits().  */
-static inline u32
+static forceinline u32
 bitstream_pop_bits(struct input_bitstream *is, unsigned num_bits)
 {
 	u32 bits = bitstream_peek_bits(is, num_bits);
@@ -134,7 +134,7 @@ bitstream_pop_bits(struct input_bitstream *is, unsigned num_bits)
 }
 
 /* Read and return the next @num_bits bits from the bitstream.  */
-static inline u32
+static forceinline u32
 bitstream_read_bits(struct input_bitstream *is, unsigned num_bits)
 {
 	bitstream_ensure_bits(is, num_bits);
@@ -142,7 +142,7 @@ bitstream_read_bits(struct input_bitstream *is, unsigned num_bits)
 }
 
 /* Read and return the next literal byte embedded in the bitstream.  */
-static inline u8
+static forceinline u8
 bitstream_read_byte(struct input_bitstream *is)
 {
 	if (unlikely(is->end == is->next))
@@ -151,7 +151,7 @@ bitstream_read_byte(struct input_bitstream *is)
 }
 
 /* Read and return the next 16-bit integer embedded in the bitstream.  */
-static inline u16
+static forceinline u16
 bitstream_read_u16(struct input_bitstream *is)
 {
 	u16 v;
@@ -164,7 +164,7 @@ bitstream_read_u16(struct input_bitstream *is)
 }
 
 /* Read and return the next 32-bit integer embedded in the bitstream.  */
-static inline u32
+static forceinline u32
 bitstream_read_u32(struct input_bitstream *is)
 {
 	u32 v;
@@ -178,7 +178,7 @@ bitstream_read_u32(struct input_bitstream *is)
 
 /* Read into @dst_buffer an array of literal bytes embedded in the bitstream.
  * Return 0 if there were enough bytes remaining in the input, otherwise -1. */
-static inline int
+static forceinline int
 bitstream_read_bytes(struct input_bitstream *is, void *dst_buffer, size_t count)
 {
 	if (unlikely(is->end - is->next < count))
@@ -189,7 +189,7 @@ bitstream_read_bytes(struct input_bitstream *is, void *dst_buffer, size_t count)
 }
 
 /* Align the input bitstream on a coding-unit boundary.  */
-static inline void
+static forceinline void
 bitstream_align(struct input_bitstream *is)
 {
 	is->bitsleft = 0;
@@ -242,7 +242,7 @@ bitstream_align(struct input_bitstream *is)
  * XXX: This is mostly duplicated in lzms_decode_huffman_symbol() in
  * lzms_decompress.c; keep them in sync!
  */
-static inline unsigned
+static forceinline unsigned
 read_huffsym(struct input_bitstream *is, const u16 decode_table[],
 	     unsigned table_bits, unsigned max_codeword_len)
 {
@@ -414,13 +414,13 @@ make_huffman_decode_table(u16 decode_table[], unsigned num_syms,
 /*                             LZ match copying                               */
 /*----------------------------------------------------------------------------*/
 
-static inline void
+static forceinline void
 copy_word_unaligned(const void *src, void *dst)
 {
 	store_word_unaligned(load_word_unaligned(src), dst);
 }
 
-static inline machine_word_t
+static forceinline machine_word_t
 repeat_u16(u16 b)
 {
 	machine_word_t v = b;
@@ -431,7 +431,7 @@ repeat_u16(u16 b)
 	return v;
 }
 
-static inline machine_word_t
+static forceinline machine_word_t
 repeat_byte(u8 b)
 {
 	return repeat_u16(((u16)b << 8) | b);
@@ -450,7 +450,7 @@ repeat_byte(u8 b)
  * 'min_length' is a hint which specifies the minimum possible match length.
  * This should be a compile-time constant.
  */
-static inline int
+static forceinline int
 lz_copy(u32 length, u32 offset, u8 *out_begin, u8 *out_next, u8 *out_end,
 	u32 min_length)
 {
diff --git a/include/wimlib/endianness.h b/include/wimlib/endianness.h
index 9cea963b..ed0b7ec4 100644
--- a/include/wimlib/endianness.h
+++ b/include/wimlib/endianness.h
@@ -47,7 +47,7 @@
 	 (((u64)(n) & 0x00FF000000000000) >> 40)	|	\
 	 (((u64)(n) & 0xFF00000000000000) >> 56))
 
-static inline u16 do_bswap16(u16 n)
+static forceinline u16 do_bswap16(u16 n)
 {
 #ifdef compiler_bswap16
 	return compiler_bswap16(n);
@@ -56,7 +56,7 @@ static inline u16 do_bswap16(u16 n)
 #endif
 }
 
-static inline u32 do_bswap32(u32 n)
+static forceinline u32 do_bswap32(u32 n)
 {
 #ifdef compiler_bswap32
 	return compiler_bswap32(n);
@@ -65,7 +65,7 @@ static inline u32 do_bswap32(u32 n)
 #endif
 }
 
-static inline u64 do_bswap64(u64 n)
+static forceinline u64 do_bswap64(u64 n)
 {
 #ifdef compiler_bswap64
 	return compiler_bswap64(n);
diff --git a/include/wimlib/hc_matchfinder.h b/include/wimlib/hc_matchfinder.h
index 1f552db2..aa2e4542 100644
--- a/include/wimlib/hc_matchfinder.h
+++ b/include/wimlib/hc_matchfinder.h
@@ -141,7 +141,7 @@ struct TEMPLATED(hc_matchfinder) {
 
 /* Return the number of bytes that must be allocated for a 'hc_matchfinder' that
  * can work with buffers up to the specified size.  */
-static inline size_t
+static forceinline size_t
 TEMPLATED(hc_matchfinder_size)(size_t max_bufsize)
 {
 	return sizeof(struct TEMPLATED(hc_matchfinder)) +
@@ -149,7 +149,7 @@ TEMPLATED(hc_matchfinder_size)(size_t max_bufsize)
 }
 
 /* Prepare the matchfinder for a new input buffer.  */
-static inline void
+static forceinline void
 TEMPLATED(hc_matchfinder_init)(struct TEMPLATED(hc_matchfinder) *mf)
 {
 	memset(mf, 0, sizeof(*mf));
@@ -184,7 +184,7 @@ TEMPLATED(hc_matchfinder_init)(struct TEMPLATED(hc_matchfinder) *mf)
  * Return the length of the match found, or 'best_len' if no match longer than
  * 'best_len' was found.
  */
-static inline u32
+static forceinline u32
 TEMPLATED(hc_matchfinder_longest_match)(struct TEMPLATED(hc_matchfinder) * const restrict mf,
 					const u8 * const restrict in_begin,
 					const ptrdiff_t cur_pos,
@@ -353,7 +353,7 @@ out:
  *
  * Returns @in_next + @count.
  */
-static inline const u8 *
+static forceinline const u8 *
 TEMPLATED(hc_matchfinder_skip_positions)(struct TEMPLATED(hc_matchfinder) * const restrict mf,
 					 const u8 * const restrict in_begin,
 					 const ptrdiff_t cur_pos,
diff --git a/include/wimlib/lz_extend.h b/include/wimlib/lz_extend.h
index cbbe88fd..26f0ce5c 100644
--- a/include/wimlib/lz_extend.h
+++ b/include/wimlib/lz_extend.h
@@ -28,7 +28,7 @@
  * Return the number of bytes at @matchptr that match the bytes at @strptr, up
  * to a maximum of @max_len.  Initially, @len bytes are matched.
  */
-static inline u32
+static forceinline u32
 lz_extend(const u8 * const strptr, const u8 * const matchptr,
 	  u32 len, const u32 max_len)
 {
diff --git a/include/wimlib/lz_hash.h b/include/wimlib/lz_hash.h
index 7416585a..f7618152 100644
--- a/include/wimlib/lz_hash.h
+++ b/include/wimlib/lz_hash.h
@@ -30,7 +30,7 @@
  * next-highest @num_bits bits of the product as the hash value, as those have
  * the most randomness.
  */
-static inline u32
+static forceinline u32
 lz_hash(u32 seq, unsigned num_bits)
 {
 	return (u32)(seq * 0x1E35A7BD) >> (32 - num_bits);
diff --git a/include/wimlib/lzms_common.h b/include/wimlib/lzms_common.h
index 6db35b77..b5071469 100644
--- a/include/wimlib/lzms_common.h
+++ b/include/wimlib/lzms_common.h
@@ -23,14 +23,14 @@ extern unsigned
 lzms_get_slot(u32 value, const u32 slot_base_tab[], unsigned num_slots);
 
 /* Return the offset slot for the specified offset  */
-static inline unsigned
+static forceinline unsigned
 lzms_get_offset_slot(u32 offset)
 {
 	return lzms_get_slot(offset, lzms_offset_slot_base, LZMS_MAX_NUM_OFFSET_SYMS);
 }
 
 /* Return the length slot for the specified length  */
-static inline unsigned
+static forceinline unsigned
 lzms_get_length_slot(u32 length)
 {
 	return lzms_get_slot(length, lzms_length_slot_base, LZMS_NUM_LENGTH_SYMS);
@@ -71,7 +71,7 @@ extern void
 lzms_init_probabilities(struct lzms_probabilites *probs);
 
 /* Given a decoded or encoded bit, update the probability entry.  */
-static inline void
+static forceinline void
 lzms_update_probability_entry(struct lzms_probability_entry *entry, int bit)
 {
 	STATIC_ASSERT(LZMS_PROBABILITY_DENOMINATOR == sizeof(entry->recent_bits) * 8);
@@ -108,7 +108,7 @@ lzms_update_probability_entry(struct lzms_probability_entry *entry, int bit)
 
 /* Given a probability entry, return the chance out of
  * LZMS_PROBABILITY_DENOMINATOR that the next decoded bit will be a 0.  */
-static inline u32
+static forceinline u32
 lzms_get_probability(const struct lzms_probability_entry *prob_entry)
 {
 	u32 prob = prob_entry->num_recent_zero_bits;
diff --git a/include/wimlib/unaligned.h b/include/wimlib/unaligned.h
index cc9f27f1..ead46295 100644
--- a/include/wimlib/unaligned.h
+++ b/include/wimlib/unaligned.h
@@ -30,13 +30,13 @@ struct type##_unaligned {					\
 	type v;							\
 } _packed_attribute;						\
 								\
-static inline type						\
+static forceinline type						\
 load_##type##_unaligned(const void *p)				\
 {								\
 	return ((const struct type##_unaligned *)p)->v;		\
 }								\
 								\
-static inline void						\
+static forceinline void						\
 store_##type##_unaligned(type val, void *p)			\
 {								\
 	((struct type##_unaligned *)p)->v = val;		\
@@ -57,7 +57,7 @@ DEFINE_UNALIGNED_TYPE(machine_word_t);
 #define load_word_unaligned	load_machine_word_t_unaligned
 #define store_word_unaligned	store_machine_word_t_unaligned
 
-static inline u16
+static forceinline u16
 get_unaligned_le16(const u8 *p)
 {
 	if (UNALIGNED_ACCESS_IS_FAST)
@@ -66,7 +66,7 @@ get_unaligned_le16(const u8 *p)
 		return ((u16)p[1] << 8) | p[0];
 }
 
-static inline u32
+static forceinline u32
 get_unaligned_le32(const u8 *p)
 {
 	if (UNALIGNED_ACCESS_IS_FAST)
@@ -76,7 +76,7 @@ get_unaligned_le32(const u8 *p)
 			((u32)p[1] << 8) | p[0];
 }
 
-static inline void
+static forceinline void
 put_unaligned_le16(u16 v, u8 *p)
 {
 	if (UNALIGNED_ACCESS_IS_FAST) {
@@ -87,7 +87,7 @@ put_unaligned_le16(u16 v, u8 *p)
 	}
 }
 
-static inline void
+static forceinline void
 put_unaligned_le32(u32 v, u8 *p)
 {
 	if (UNALIGNED_ACCESS_IS_FAST) {
@@ -106,7 +106,7 @@ put_unaligned_le32(u32 v, u8 *p)
  * bits contain the first 3 bytes, arranged in octets in a platform-dependent
  * order, at the memory location from which the input 32-bit value was loaded.
  */
-static inline u32
+static forceinline u32
 loaded_u32_to_u24(u32 v)
 {
 	if (CPU_IS_LITTLE_ENDIAN)
@@ -121,7 +121,7 @@ loaded_u32_to_u24(u32 v)
  * in the 24 bits is platform-dependent.  At least LOAD_U24_REQUIRED_NBYTES
  * bytes must be available at @p; note that this may be more than 3.
  */
-static inline u32
+static forceinline u32
 load_u24_unaligned(const u8 *p)
 {
 #if UNALIGNED_ACCESS_IS_FAST
diff --git a/src/divsufsort.c b/src/divsufsort.c
index 67536956..c80412f5 100644
--- a/src/divsufsort.c
+++ b/src/divsufsort.c
@@ -111,7 +111,7 @@ static const int lg_table[256]= {
 
 #if (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE)
 
-static inline
+static forceinline
 int
 ss_ilg(int n) {
 #if SS_BLOCKSIZE == 0
@@ -154,7 +154,7 @@ static const int sqq_table[256] = {
 247, 248, 248, 249, 249, 250, 250, 251, 251, 252, 252, 253, 253, 254, 254, 255
 };
 
-static inline
+static forceinline
 int
 ss_isqrt(int x) {
   int y, e;
@@ -187,7 +187,7 @@ ss_isqrt(int x) {
 /*---------------------------------------------------------------------------*/
 
 /* Compares two suffixes. */
-static inline
+static forceinline
 int
 ss_compare(const unsigned char *T,
            const int *p1, const int *p2,
@@ -238,7 +238,7 @@ ss_insertionsort(const unsigned char *T, const int *PA,
 
 #if (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE)
 
-static inline
+static forceinline
 void
 ss_fixdown(const unsigned char *Td, const int *PA,
            int *SA, int i, int size) {
@@ -280,7 +280,7 @@ ss_heapsort(const unsigned char *Td, const int *PA, int *SA, int size) {
 /*---------------------------------------------------------------------------*/
 
 /* Returns the median of three elements. */
-static inline
+static forceinline
 int *
 ss_median3(const unsigned char *Td, const int *PA,
            int *v1, int *v2, int *v3) {
@@ -293,7 +293,7 @@ ss_median3(const unsigned char *Td, const int *PA,
 }
 
 /* Returns the median of five elements. */
-static inline
+static forceinline
 int *
 ss_median5(const unsigned char *Td, const int *PA,
            int *v1, int *v2, int *v3, int *v4, int *v5) {
@@ -307,7 +307,7 @@ ss_median5(const unsigned char *Td, const int *PA,
 }
 
 /* Returns the pivot element. */
-static inline
+static forceinline
 int *
 ss_pivot(const unsigned char *Td, const int *PA, int *first, int *last) {
   int *middle;
@@ -335,7 +335,7 @@ ss_pivot(const unsigned char *Td, const int *PA, int *first, int *last) {
 /*---------------------------------------------------------------------------*/
 
 /* Binary partition for substrings. */
-static inline
+static forceinline
 int *
 ss_partition(const int *PA,
                     int *first, int *last, int depth) {
@@ -496,7 +496,7 @@ ss_mintrosort(const unsigned char *T, const int *PA,
 
 #if SS_BLOCKSIZE != 0
 
-static inline
+static forceinline
 void
 ss_blockswap(int *a, int *b, int n) {
   int t;
@@ -505,7 +505,7 @@ ss_blockswap(int *a, int *b, int n) {
   }
 }
 
-static inline
+static forceinline
 void
 ss_rotate(int *first, int *middle, int *last) {
   int *a, *b, t;
@@ -865,7 +865,7 @@ sssort(const unsigned char *T, const int *PA,
 
 /*---------------------------------------------------------------------------*/
 
-static inline
+static forceinline
 int
 tr_ilg(int n) {
   return (n & 0xffff0000) ?
@@ -900,7 +900,7 @@ tr_insertionsort(const int *ISAd, int *first, int *last) {
 
 /*---------------------------------------------------------------------------*/
 
-static inline
+static forceinline
 void
 tr_fixdown(const int *ISAd, int *SA, int i, int size) {
   int j, k;
@@ -941,7 +941,7 @@ tr_heapsort(const int *ISAd, int *SA, int size) {
 /*---------------------------------------------------------------------------*/
 
 /* Returns the median of three elements. */
-static inline
+static forceinline
 int *
 tr_median3(const int *ISAd, int *v1, int *v2, int *v3) {
   if(ISAd[*v1] > ISAd[*v2]) { SWAP(v1, v2); }
@@ -953,7 +953,7 @@ tr_median3(const int *ISAd, int *v1, int *v2, int *v3) {
 }
 
 /* Returns the median of five elements. */
-static inline
+static forceinline
 int *
 tr_median5(const int *ISAd,
            int *v1, int *v2, int *v3, int *v4, int *v5) {
@@ -967,7 +967,7 @@ tr_median5(const int *ISAd,
 }
 
 /* Returns the pivot element. */
-static inline
+static forceinline
 int *
 tr_pivot(const int *ISAd, int *first, int *last) {
   int *middle;
@@ -1002,14 +1002,14 @@ struct _trbudget_t {
   int count;
 };
 
-static inline
+static forceinline
 void
 trbudget_init(trbudget_t *budget, int chance, int incval) {
   budget->chance = chance;
   budget->remain = budget->incval = incval;
 }
 
-static inline
+static forceinline
 int
 trbudget_check(trbudget_t *budget, int size) {
   if(size <= budget->remain) { budget->remain -= size; return 1; }
@@ -1022,7 +1022,7 @@ trbudget_check(trbudget_t *budget, int size) {
 
 /*---------------------------------------------------------------------------*/
 
-static inline
+static forceinline
 void
 tr_partition(const int *ISAd,
              int *first, int *middle, int *last,
diff --git a/src/encoding.c b/src/encoding.c
index 6d40605b..9337c9a1 100644
--- a/src/encoding.c
+++ b/src/encoding.c
@@ -77,7 +77,7 @@ typedef unsigned (*decode_codepoint_fn)(const u8 *in, size_t remaining,
 /* Encode the Unicode codepoint @c and return the number of bytes used. */
 typedef unsigned (*encode_codepoint_fn)(u32 c, u8 *out);
 
-static inline unsigned
+static forceinline unsigned
 utf8_decode_codepoint(const u8 *in, size_t remaining, bool validate, u32 *c_ret)
 {
 	if (likely(in[0] < 0x80)) { /* U+0...U+7F */
@@ -124,7 +124,7 @@ invalid:
 	return 1;
 }
 
-static inline unsigned
+static forceinline unsigned
 utf8_encode_codepoint(u32 c, u8 *out)
 {
 	if (likely(c < 0x80)) {
@@ -152,7 +152,7 @@ utf8_encode_codepoint(u32 c, u8 *out)
 	return 4;
 }
 
-static inline unsigned
+static forceinline unsigned
 utf16le_decode_codepoint(const u8 *in, size_t remaining, bool validate,
 			 u32 *c_ret)
 {
@@ -188,7 +188,7 @@ invalid:
 	return min(remaining, 2);
 }
 
-static inline unsigned
+static forceinline unsigned
 utf16le_encode_codepoint(u32 c, u8 *out)
 {
 	if (likely(c < 0x10000)) {
@@ -213,7 +213,7 @@ utf16le_encode_codepoint(u32 c, u8 *out)
  * If the input string is malformed, return @ilseq_err with errno set to EILSEQ.
  * If out of memory, return WIMLIB_ERR_NOMEM with errno set to ENOMEM.
  */
-static inline int
+static forceinline int
 convert_string(const u8 * const in, const size_t in_nbytes,
 	       u8 **out_ret, size_t *out_nbytes_ret,
 	       int ilseq_err,
diff --git a/src/lcpit_matchfinder.c b/src/lcpit_matchfinder.c
index 2562bfb2..8b9ffd9d 100644
--- a/src/lcpit_matchfinder.c
+++ b/src/lcpit_matchfinder.c
@@ -284,7 +284,7 @@ build_LCPIT(u32 intervals[restrict], u32 pos_data[restrict], const u32 n)
  * around by just continuing until we get to a link that actually takes us
  * higher in the tree.  This can be described as a lazy-update scheme.
  */
-static inline u32
+static forceinline u32
 lcpit_advance_one_byte(const u32 cur_pos,
 		       u32 pos_data[restrict],
 		       u32 intervals[restrict],
@@ -486,7 +486,7 @@ build_LCPIT_huge(u64 intervals64[restrict], u32 pos_data[restrict], const u32 n)
 
 /* Like lcpit_advance_one_byte(), but for buffers larger than
  * MAX_NORMAL_BUFSIZE.  */
-static inline u32
+static forceinline u32
 lcpit_advance_one_byte_huge(const u32 cur_pos,
 			    u32 pos_data[restrict],
 			    u64 intervals64[restrict],
@@ -538,14 +538,14 @@ lcpit_advance_one_byte_huge(const u32 cur_pos,
 	return matchptr - matches;
 }
 
-static inline u64
+static forceinline u64
 get_pos_data_size(size_t max_bufsize)
 {
 	return (u64)max((u64)max_bufsize + PREFETCH_SAFETY,
 			DIVSUFSORT_TMP_LEN) * sizeof(u32);
 }
 
-static inline u64
+static forceinline u64
 get_intervals_size(size_t max_bufsize)
 {
 	return ((u64)max_bufsize + PREFETCH_SAFETY) *
diff --git a/src/lzms_common.c b/src/lzms_common.c
index 57c17d4d..380b7431 100644
--- a/src/lzms_common.c
+++ b/src/lzms_common.c
@@ -379,7 +379,7 @@ lzms_dilute_symbol_frequencies(u32 freqs[], unsigned num_syms)
 
 
 #ifdef __x86_64__
-static inline u8 *
+static forceinline u8 *
 find_next_opcode_sse4_2(u8 *p)
 {
 	const __v16qi potential_opcodes = (__v16qi) {0x48, 0x4C, 0xE8, 0xE9, 0xF0, 0xFF};
@@ -401,7 +401,7 @@ find_next_opcode_sse4_2(u8 *p)
 }
 #endif /* __x86_64__ */
 
-static inline u8 *
+static forceinline u8 *
 find_next_opcode_default(u8 *p)
 {
 	/*
@@ -433,7 +433,7 @@ find_next_opcode_default(u8 *p)
 	return p;
 }
 
-static inline u8 *
+static forceinline u8 *
 translate_if_needed(u8 *data, u8 *p, s32 *last_x86_pos,
 		    s32 last_target_usages[], bool undo)
 {
diff --git a/src/lzms_compress.c b/src/lzms_compress.c
index 8ee9e818..09999957 100644
--- a/src/lzms_compress.c
+++ b/src/lzms_compress.c
@@ -427,7 +427,7 @@ lzms_init_offset_slot_tabs(struct lzms_compressor *c)
  * Return the length slot for the specified match length, using the compressor's
  * acceleration table if the length is small enough.
  */
-static inline unsigned
+static forceinline unsigned
 lzms_comp_get_length_slot(const struct lzms_compressor *c, u32 length)
 {
 	if (likely(length <= MAX_FAST_LENGTH))
@@ -439,7 +439,7 @@ lzms_comp_get_length_slot(const struct lzms_compressor *c, u32 length)
  * Return the offset slot for the specified match offset, using the compressor's
  * acceleration tables to speed up the mapping.
  */
-static inline unsigned
+static forceinline unsigned
 lzms_comp_get_offset_slot(const struct lzms_compressor *c, u32 offset)
 {
 	if (offset < 0xe4a5)
@@ -529,7 +529,7 @@ lzms_range_encoder_flush(struct lzms_range_encoder *rc)
  * @prob is the probability out of LZMS_PROBABILITY_DENOMINATOR that the next
  * bit is 0 rather than 1.
  */
-static inline void
+static forceinline void
 lzms_range_encode_bit(struct lzms_range_encoder *rc, int bit, u32 prob)
 {
 	/* Normalize if needed.  */
@@ -551,7 +551,7 @@ lzms_range_encode_bit(struct lzms_range_encoder *rc, int bit, u32 prob)
  * Encode a bit.  This wraps around lzms_range_encode_bit() to handle using and
  * updating the state and its corresponding probability entry.
  */
-static inline void
+static forceinline void
 lzms_encode_bit(int bit, unsigned *state_p, unsigned num_states,
 		struct lzms_probability_entry *probs,
 		struct lzms_range_encoder *rc)
@@ -644,7 +644,7 @@ lzms_output_bitstream_init(struct lzms_output_bitstream *os,
  * @max_num_bits is a compile-time constant that specifies the maximum number of
  * bits that can ever be written at this call site.
  */
-static inline void
+static forceinline void
 lzms_write_bits(struct lzms_output_bitstream *os, const u32 bits,
 		const unsigned num_bits, const unsigned max_num_bits)
 {
@@ -725,7 +725,7 @@ lzms_rebuild_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info)
  * Encode a symbol using the specified Huffman code.  Then, if the Huffman code
  * needs to be rebuilt, rebuild it and return true; otherwise return false.
  */
-static inline bool
+static forceinline bool
 lzms_huffman_encode_symbol(unsigned sym,
 			   const u32 *codewords, const u8 *lens, u32 *freqs,
 			   struct lzms_output_bitstream *os,
@@ -936,7 +936,7 @@ lzms_encode_nonempty_item_list(struct lzms_compressor *c,
 	} while (cur_node != end_node);
 }
 
-static inline void
+static forceinline void
 lzms_encode_item_list(struct lzms_compressor *c,
 		      struct lzms_optimum_node *end_node)
 {
@@ -1003,14 +1003,14 @@ lzms_compute_bit_costs(void)
 #endif
 
 /* Return the cost to encode a 0 bit in the specified context.  */
-static inline u32
+static forceinline u32
 lzms_bit_0_cost(unsigned state, const struct lzms_probability_entry *probs)
 {
 	return lzms_bit_costs[probs[state].num_recent_zero_bits];
 }
 
 /* Return the cost to encode a 1 bit in the specified context.  */
-static inline u32
+static forceinline u32
 lzms_bit_1_cost(unsigned state, const struct lzms_probability_entry *probs)
 {
 	return lzms_bit_costs[LZMS_PROBABILITY_DENOMINATOR -
@@ -1018,7 +1018,7 @@ lzms_bit_1_cost(unsigned state, const struct lzms_probability_entry *probs)
 }
 
 /* Return the cost to encode a literal, including the main bit.  */
-static inline u32
+static forceinline u32
 lzms_literal_cost(struct lzms_compressor *c, unsigned main_state, unsigned literal)
 {
 	return lzms_bit_0_cost(main_state, c->probs.main) +
@@ -1043,14 +1043,14 @@ lzms_update_fast_length_costs(struct lzms_compressor *c)
 
 /* Return the cost to encode the specified match length, which must not exceed
  * MAX_FAST_LENGTH.  */
-static inline u32
+static forceinline u32
 lzms_fast_length_cost(const struct lzms_compressor *c, u32 length)
 {
 	return c->fast_length_cost_tab[length];
 }
 
 /* Return the cost to encode the specified LZ match offset.  */
-static inline u32
+static forceinline u32
 lzms_lz_offset_cost(const struct lzms_compressor *c, u32 offset)
 {
 	unsigned slot = lzms_comp_get_offset_slot(c, offset);
@@ -1059,7 +1059,7 @@ lzms_lz_offset_cost(const struct lzms_compressor *c, u32 offset)
 }
 
 /* Return the cost to encode the specified delta power and raw offset.  */
-static inline u32
+static forceinline u32
 lzms_delta_source_cost(const struct lzms_compressor *c, u32 power, u32 raw_offset)
 {
 	unsigned slot = lzms_comp_get_offset_slot(c, raw_offset);
@@ -1122,31 +1122,31 @@ lzms_update_lru_queues(struct lzms_adaptive_state *state)
 	state->prev_delta_pair = state->upcoming_delta_pair;
 }
 
-static inline void
+static forceinline void
 lzms_update_state(u8 *state_p, int bit, unsigned num_states)
 {
 	*state_p = ((*state_p << 1) | bit) & (num_states - 1);
 }
 
-static inline void
+static forceinline void
 lzms_update_main_state(struct lzms_adaptive_state *state, int is_match)
 {
 	lzms_update_state(&state->main_state, is_match, LZMS_NUM_MAIN_PROBS);
 }
 
-static inline void
+static forceinline void
 lzms_update_match_state(struct lzms_adaptive_state *state, int is_delta)
 {
 	lzms_update_state(&state->match_state, is_delta, LZMS_NUM_MATCH_PROBS);
 }
 
-static inline void
+static forceinline void
 lzms_update_lz_state(struct lzms_adaptive_state *state, int is_rep)
 {
 	lzms_update_state(&state->lz_state, is_rep, LZMS_NUM_LZ_PROBS);
 }
 
-static inline void
+static forceinline void
 lzms_update_lz_rep_states(struct lzms_adaptive_state *state, int rep_idx)
 {
 	for (int i = 0; i < rep_idx; i++)
@@ -1156,13 +1156,13 @@ lzms_update_lz_rep_states(struct lzms_adaptive_state *state, int rep_idx)
 		lzms_update_state(&state->lz_rep_states[rep_idx], 0, LZMS_NUM_LZ_REP_PROBS);
 }
 
-static inline void
+static forceinline void
 lzms_update_delta_state(struct lzms_adaptive_state *state, int is_rep)
 {
 	lzms_update_state(&state->delta_state, is_rep, LZMS_NUM_DELTA_PROBS);
 }
 
-static inline void
+static forceinline void
 lzms_update_delta_rep_states(struct lzms_adaptive_state *state, int rep_idx)
 {
 	for (int i = 0; i < rep_idx; i++)
@@ -1199,7 +1199,7 @@ lzms_init_delta_matchfinder(struct lzms_compressor *c)
  * NBYTES_HASHED_FOR_DELTA bytes of the sequence beginning at @p when taken in a
  * delta context with the specified @span.
  */
-static inline u32
+static forceinline u32
 lzms_delta_hash(const u8 *p, const u32 pos, u32 span)
 {
 	/* A delta match has a certain span and an offset that is a multiple of
@@ -1222,7 +1222,7 @@ lzms_delta_hash(const u8 *p, const u32 pos, u32 span)
  * specified @span and having the initial @len, extend the match as far as
  * possible, up to a limit of @max_len.
  */
-static inline u32
+static forceinline u32
 lzms_extend_delta_match(const u8 *in_next, const u8 *matchptr,
 			u32 len, u32 max_len, u32 span)
 {
diff --git a/src/lzms_decompress.c b/src/lzms_decompress.c
index 2ef2debd..4dd36627 100644
--- a/src/lzms_decompress.c
+++ b/src/lzms_decompress.c
@@ -376,7 +376,7 @@ lzms_input_bitstream_init(struct lzms_input_bitstream *is,
 
 /* Ensure that at least @num_bits bits are in the bitbuffer variable.
  * @num_bits cannot be more than 32.  */
-static inline void
+static forceinline void
 lzms_ensure_bits(struct lzms_input_bitstream *is, unsigned num_bits)
 {
 	unsigned avail;
@@ -408,14 +408,14 @@ lzms_ensure_bits(struct lzms_input_bitstream *is, unsigned num_bits)
 }
 
 /* Get @num_bits bits from the bitbuffer variable.  */
-static inline bitbuf_t
+static forceinline bitbuf_t
 lzms_peek_bits(struct lzms_input_bitstream *is, unsigned num_bits)
 {
 	return (is->bitbuf >> 1) >> (BITBUF_NBITS - num_bits - 1);
 }
 
 /* Remove @num_bits bits from the bitbuffer variable.  */
-static inline void
+static forceinline void
 lzms_remove_bits(struct lzms_input_bitstream *is, unsigned num_bits)
 {
 	is->bitbuf <<= num_bits;
@@ -423,7 +423,7 @@ lzms_remove_bits(struct lzms_input_bitstream *is, unsigned num_bits)
 }
 
 /* Remove and return @num_bits bits from the bitbuffer variable.  */
-static inline bitbuf_t
+static forceinline bitbuf_t
 lzms_pop_bits(struct lzms_input_bitstream *is, unsigned num_bits)
 {
 	bitbuf_t bits = lzms_peek_bits(is, num_bits);
@@ -432,7 +432,7 @@ lzms_pop_bits(struct lzms_input_bitstream *is, unsigned num_bits)
 }
 
 /* Read @num_bits bits from the input bitstream.  */
-static inline bitbuf_t
+static forceinline bitbuf_t
 lzms_read_bits(struct lzms_input_bitstream *is, unsigned num_bits)
 {
 	lzms_ensure_bits(is, num_bits);
@@ -457,7 +457,7 @@ lzms_range_decoder_init(struct lzms_range_decoder *rd,
  * probability entry to use.  The state and probability entry will be updated
  * based on the decoded bit.
  */
-static inline int
+static forceinline int
 lzms_decode_bit(struct lzms_range_decoder *rd, u32 *state_p, u32 num_states,
 		struct lzms_probability_entry *probs)
 {
@@ -597,7 +597,7 @@ lzms_rebuild_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info)
 
 /* XXX: mostly copied from read_huffsym() in decompress_common.h because LZMS
  * needs its own bitstream */
-static inline unsigned
+static forceinline unsigned
 lzms_decode_huffman_symbol(struct lzms_input_bitstream *is, u16 decode_table[],
 			   unsigned table_bits, u32 freqs[],
 			   struct lzms_huffman_rebuild_info *rebuild_info)
@@ -627,7 +627,7 @@ lzms_decode_huffman_symbol(struct lzms_input_bitstream *is, u16 decode_table[],
 	return symbol;
 }
 
-static inline unsigned
+static forceinline unsigned
 lzms_decode_literal(struct lzms_decompressor *d,
 		    struct lzms_input_bitstream *is)
 {
@@ -638,7 +638,7 @@ lzms_decode_literal(struct lzms_decompressor *d,
 					  &d->literal_rebuild_info);
 }
 
-static inline u32
+static forceinline u32
 lzms_decode_lz_offset(struct lzms_decompressor *d,
 		      struct lzms_input_bitstream *is)
 {
@@ -651,7 +651,7 @@ lzms_decode_lz_offset(struct lzms_decompressor *d,
 	       lzms_read_bits(is, lzms_extra_offset_bits[slot]);
 }
 
-static inline u32
+static forceinline u32
 lzms_decode_length(struct lzms_decompressor *d,
 		   struct lzms_input_bitstream *is)
 {
@@ -668,7 +668,7 @@ lzms_decode_length(struct lzms_decompressor *d,
 	return length;
 }
 
-static inline u32
+static forceinline u32
 lzms_decode_delta_offset(struct lzms_decompressor *d,
 			 struct lzms_input_bitstream *is)
 {
@@ -681,7 +681,7 @@ lzms_decode_delta_offset(struct lzms_decompressor *d,
 	       lzms_read_bits(is, lzms_extra_offset_bits[slot]);
 }
 
-static inline unsigned
+static forceinline unsigned
 lzms_decode_delta_power(struct lzms_decompressor *d,
 			struct lzms_input_bitstream *is)
 {
diff --git a/src/lzx_compress.c b/src/lzx_compress.c
index 10b51902..21636d49 100644
--- a/src/lzx_compress.c
+++ b/src/lzx_compress.c
@@ -488,7 +488,7 @@ struct lzx_compressor {
  * This requires that the limit be no more than the length of offset_slot_tab_1
  * (currently 32768).
  */
-static inline bool
+static forceinline bool
 lzx_is_16_bit(size_t max_bufsize)
 {
 	STATIC_ASSERT(ARRAY_LEN(((struct lzx_compressor *)0)->offset_slot_tab_1) == 32768);
@@ -498,7 +498,7 @@ lzx_is_16_bit(size_t max_bufsize)
 /*
  * Return the offset slot for the specified adjusted match offset.
  */
-static inline unsigned
+static forceinline unsigned
 lzx_get_offset_slot(struct lzx_compressor *c, u32 adjusted_offset,
 		    bool is_16_bit)
 {
@@ -574,7 +574,7 @@ lzx_init_output(struct lzx_output_bitstream *os, void *buffer, size_t size)
  * Add some bits to the bitbuffer variable of the output bitstream.  The caller
  * must make sure there is enough room.
  */
-static inline void
+static forceinline void
 lzx_add_bits(struct lzx_output_bitstream *os, u32 bits, unsigned num_bits)
 {
 	os->bitbuf = (os->bitbuf << num_bits) | bits;
@@ -586,7 +586,7 @@ lzx_add_bits(struct lzx_output_bitstream *os, u32 bits, unsigned num_bits)
  * specifies the maximum number of bits that may have been added since the last
  * flush.
  */
-static inline void
+static forceinline void
 lzx_flush_bits(struct lzx_output_bitstream *os, unsigned max_num_bits)
 {
 	/* Masking the number of bits to shift is only needed to avoid undefined
@@ -609,7 +609,7 @@ lzx_flush_bits(struct lzx_output_bitstream *os, unsigned max_num_bits)
 }
 
 /* Add at most 16 bits to the bitbuffer and flush it.  */
-static inline void
+static forceinline void
 lzx_write_bits(struct lzx_output_bitstream *os, u32 bits, unsigned num_bits)
 {
 	lzx_add_bits(os, bits, num_bits);
@@ -1218,7 +1218,7 @@ lzx_init_block_split_stats(struct lzx_block_split_stats *stats)
 
 /* Literal observation.  Heuristic: use the top 2 bits and low 1 bits of the
  * literal, for 8 possible literal observation types.  */
-static inline void
+static forceinline void
 lzx_observe_literal(struct lzx_block_split_stats *stats, u8 lit)
 {
 	stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++;
@@ -1227,7 +1227,7 @@ lzx_observe_literal(struct lzx_block_split_stats *stats, u8 lit)
 
 /* Match observation.  Heuristic: use one observation type for "short match" and
  * one observation type for "long match".  */
-static inline void
+static forceinline void
 lzx_observe_match(struct lzx_block_split_stats *stats, unsigned length)
 {
 	stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + (length >= 5)]++;
@@ -1298,26 +1298,26 @@ struct lzx_lru_queue {
 	((u64)1 << LZX_QUEUE_R1_SHIFT) |	\
 	((u64)1 << LZX_QUEUE_R2_SHIFT) }
 
-static inline u64
+static forceinline u64
 lzx_lru_queue_R0(struct lzx_lru_queue queue)
 {
 	return (queue.R >> LZX_QUEUE_R0_SHIFT) & LZX_QUEUE_OFFSET_MASK;
 }
 
-static inline u64
+static forceinline u64
 lzx_lru_queue_R1(struct lzx_lru_queue queue)
 {
 	return (queue.R >> LZX_QUEUE_R1_SHIFT) & LZX_QUEUE_OFFSET_MASK;
 }
 
-static inline u64
+static forceinline u64
 lzx_lru_queue_R2(struct lzx_lru_queue queue)
 {
 	return (queue.R >> LZX_QUEUE_R2_SHIFT) & LZX_QUEUE_OFFSET_MASK;
 }
 
 /* Push a match offset onto the front (most recently used) end of the queue.  */
-static inline struct lzx_lru_queue
+static forceinline struct lzx_lru_queue
 lzx_lru_queue_push(struct lzx_lru_queue queue, u32 offset)
 {
 	return (struct lzx_lru_queue) {
@@ -1326,7 +1326,7 @@ lzx_lru_queue_push(struct lzx_lru_queue queue, u32 offset)
 }
 
 /* Swap a match offset to the front of the queue.  */
-static inline struct lzx_lru_queue
+static forceinline struct lzx_lru_queue
 lzx_lru_queue_swap(struct lzx_lru_queue queue, unsigned idx)
 {
 	unsigned shift = idx * 21;
@@ -1340,7 +1340,7 @@ lzx_lru_queue_swap(struct lzx_lru_queue queue, unsigned idx)
 	};
 }
 
-static inline u32
+static forceinline u32
 lzx_walk_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit,
 		   bool record)
 {
@@ -1475,7 +1475,7 @@ lzx_walk_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit,
  * beginning of the block), but this doesn't matter because this function only
  * computes frequencies.
  */
-static inline void
+static forceinline void
 lzx_tally_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit)
 {
 	lzx_walk_item_list(c, block_size, is_16_bit, false);
@@ -1490,7 +1490,7 @@ lzx_tally_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit)
  * first-to-last order.  The return value is the index in c->chosen_sequences at
  * which the lzx_sequences begin.
  */
-static inline u32
+static forceinline u32
 lzx_record_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit)
 {
 	return lzx_walk_item_list(c, block_size, is_16_bit, true);
@@ -1530,7 +1530,7 @@ lzx_record_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit)
  * one step ahead, with the exception of special consideration for "gap
  * matches".
  */
-static inline struct lzx_lru_queue
+static forceinline struct lzx_lru_queue
 lzx_find_min_cost_path(struct lzx_compressor * const restrict c,
 		       const u8 * const restrict block_begin,
 		       const u32 block_size,
@@ -2095,7 +2095,7 @@ lzx_set_costs_from_codes(struct lzx_compressor *c)
  * for the block uses default costs; additional passes use costs derived from
  * the Huffman codes computed in the previous pass.
  */
-static inline struct lzx_lru_queue
+static forceinline struct lzx_lru_queue
 lzx_optimize_and_flush_block(struct lzx_compressor * const restrict c,
 			     struct lzx_output_bitstream * const restrict os,
 			     const u8 * const restrict block_begin,
@@ -2144,7 +2144,7 @@ lzx_optimize_and_flush_block(struct lzx_compressor * const restrict c,
  * time, but rather to produce a compression ratio significantly better than a
  * simpler "greedy" or "lazy" parse while still being relatively fast.
  */
-static inline void
+static forceinline void
 lzx_compress_near_optimal(struct lzx_compressor * restrict c,
 			  const u8 * const restrict in_begin, size_t in_nbytes,
 			  struct lzx_output_bitstream * restrict os,
@@ -2349,7 +2349,7 @@ lzx_compress_near_optimal_32(struct lzx_compressor *c, const u8 *in,
  * Huffman symbol for the literal, increments the current literal run length,
  * and "observes" the literal for the block split statistics.
  */
-static inline void
+static forceinline void
 lzx_choose_literal(struct lzx_compressor *c, unsigned literal, u32 *litrunlen_p)
 {
 	lzx_observe_literal(&c->split_stats, literal);
@@ -2363,7 +2363,7 @@ lzx_choose_literal(struct lzx_compressor *c, unsigned literal, u32 *litrunlen_p)
  * literal run, updates the recent offsets queue, and "observes" the match for
  * the block split statistics.
  */
-static inline void
+static forceinline void
 lzx_choose_match(struct lzx_compressor *c, unsigned length, u32 adjusted_offset,
 		 u32 recent_offsets[LZX_NUM_RECENT_OFFSETS], bool is_16_bit,
 		 u32 *litrunlen_p, struct lzx_sequence **next_seq_p)
@@ -2425,7 +2425,7 @@ lzx_choose_match(struct lzx_compressor *c, unsigned length, u32 adjusted_offset,
  * which is just a literal run with no following match.  This literal run might
  * be empty.
  */
-static inline void
+static forceinline void
 lzx_finish_sequence(struct lzx_sequence *last_seq, u32 litrunlen)
 {
 	last_seq->litrunlen = litrunlen;
@@ -2492,7 +2492,7 @@ lzx_find_longest_repeat_offset_match(const u8 * const in_next,
  * offset matches, since those require fewer bits to encode.
  */
 
-static inline unsigned
+static forceinline unsigned
 lzx_explicit_offset_match_score(unsigned len, u32 adjusted_offset)
 {
 	unsigned score = len;
@@ -2505,7 +2505,7 @@ lzx_explicit_offset_match_score(unsigned len, u32 adjusted_offset)
 	return score;
 }
 
-static inline unsigned
+static forceinline unsigned
 lzx_repeat_offset_match_score(unsigned rep_len, unsigned rep_idx)
 {
 	return rep_len + 3;
@@ -2523,7 +2523,7 @@ lzx_repeat_offset_match_score(unsigned rep_len, unsigned rep_idx)
  * when we decide whether a match is "better" than another, we take the offset
  * into consideration as well as the length.
  */
-static inline void
+static forceinline void
 lzx_compress_lazy(struct lzx_compressor * restrict c,
 		  const u8 * const restrict in_begin, size_t in_nbytes,
 		  struct lzx_output_bitstream * restrict os, bool is_16_bit)
diff --git a/src/lzx_decompress.c b/src/lzx_decompress.c
index cce98e32..299b5409 100644
--- a/src/lzx_decompress.c
+++ b/src/lzx_decompress.c
@@ -118,7 +118,7 @@ struct lzx_decompressor {
 } _aligned_attribute(DECODE_TABLE_ALIGNMENT);
 
 /* Read a Huffman-encoded symbol using the precode. */
-static inline unsigned
+static forceinline unsigned
 read_presym(const struct lzx_decompressor *d, struct input_bitstream *is)
 {
 	return read_huffsym(is, d->precode_decode_table,
@@ -126,7 +126,7 @@ read_presym(const struct lzx_decompressor *d, struct input_bitstream *is)
 }
 
 /* Read a Huffman-encoded symbol using the main code. */
-static inline unsigned
+static forceinline unsigned
 read_mainsym(const struct lzx_decompressor *d, struct input_bitstream *is)
 {
 	return read_huffsym(is, d->maincode_decode_table,
@@ -134,7 +134,7 @@ read_mainsym(const struct lzx_decompressor *d, struct input_bitstream *is)
 }
 
 /* Read a Huffman-encoded symbol using the length code. */
-static inline unsigned
+static forceinline unsigned
 read_lensym(const struct lzx_decompressor *d, struct input_bitstream *is)
 {
 	return read_huffsym(is, d->lencode_decode_table,
@@ -142,7 +142,7 @@ read_lensym(const struct lzx_decompressor *d, struct input_bitstream *is)
 }
 
 /* Read a Huffman-encoded symbol using the aligned offset code. */
-static inline unsigned
+static forceinline unsigned
 read_alignedsym(const struct lzx_decompressor *d, struct input_bitstream *is)
 {
 	return read_huffsym(is, d->alignedcode_decode_table,
diff --git a/src/xpress_compress.c b/src/xpress_compress.c
index 99a4b46a..1b430912 100644
--- a/src/xpress_compress.c
+++ b/src/xpress_compress.c
@@ -279,7 +279,7 @@ xpress_init_output(struct xpress_output_bitstream *os, void *buffer, size_t size
  * If the output buffer space is exhausted, then the bits will be ignored, and
  * xpress_flush_output() will return 0 when it gets called.
  */
-static inline void
+static forceinline void
 xpress_write_bits(struct xpress_output_bitstream *os,
 		  const u32 bits, const unsigned num_bits)
 {
@@ -303,7 +303,7 @@ xpress_write_bits(struct xpress_output_bitstream *os,
 /*
  * Interweave a literal byte into the output bitstream.
  */
-static inline void
+static forceinline void
 xpress_write_byte(struct xpress_output_bitstream *os, u8 byte)
 {
 	if (os->next_byte < os->end)
@@ -313,7 +313,7 @@ xpress_write_byte(struct xpress_output_bitstream *os, u8 byte)
 /*
  * Interweave two literal bytes into the output bitstream.
  */
-static inline void
+static forceinline void
 xpress_write_u16(struct xpress_output_bitstream *os, u16 v)
 {
 	if (os->end - os->next_byte >= 2) {
@@ -338,7 +338,7 @@ xpress_flush_output(struct xpress_output_bitstream *os)
 	return os->next_byte - os->start;
 }
 
-static inline void
+static forceinline void
 xpress_write_extra_length_bytes(struct xpress_output_bitstream *os,
 				unsigned adjusted_len)
 {
@@ -353,7 +353,7 @@ xpress_write_extra_length_bytes(struct xpress_output_bitstream *os,
 }
 
 /* Output a match or literal.  */
-static inline void
+static forceinline void
 xpress_write_item(struct xpress_item item, struct xpress_output_bitstream *os,
 		  const u32 codewords[], const u8 lens[])
 {
@@ -484,7 +484,7 @@ xpress_write(struct xpress_compressor *c, void *out, size_t out_nbytes_avail,
 
 /* Tally the Huffman symbol for a literal and return the intermediate
  * representation of that literal.  */
-static inline struct xpress_item
+static forceinline struct xpress_item
 xpress_record_literal(struct xpress_compressor *c, unsigned literal)
 {
 	c->freqs[literal]++;
@@ -496,7 +496,7 @@ xpress_record_literal(struct xpress_compressor *c, unsigned literal)
 
 /* Tally the Huffman symbol for a match and return the intermediate
  * representation of that match.  */
-static inline struct xpress_item
+static forceinline struct xpress_item
 xpress_record_match(struct xpress_compressor *c, unsigned length, unsigned offset)
 {
 	unsigned adjusted_len = length - XPRESS_MIN_MATCH_LEN;