X-Git-Url: https://wimlib.net/git/?p=wimlib;a=blobdiff_plain;f=src%2Flzx_compress.c;h=de8430145f4e7a2867ad60c59b5d17a6b5a4436e;hp=8a14f3c84a60d0b86d9fed9b08ae1b3f6273c377;hb=26396c2c45946ba38c18c6ac0207e8c1f68e4668;hpb=e402076a7ef7b3d3f7d9e0d3eecd2eebc61516f1 diff --git a/src/lzx_compress.c b/src/lzx_compress.c index 8a14f3c8..de843014 100644 --- a/src/lzx_compress.c +++ b/src/lzx_compress.c @@ -126,32 +126,46 @@ #define LZX_MAX_FAST_LEVEL 34 /* - * LZX_HASH2_ORDER is the log base 2 of the number of entries in the hash table - * for finding length 2 matches. This can be as high as 16 (in which case the - * hash function is trivial), but using a smaller hash table speeds up - * compression due to reduced cache pressure. + * BT_MATCHFINDER_HASH2_ORDER is the log base 2 of the number of entries in the + * hash table for finding length 2 matches. This could be as high as 16, but + * using a smaller hash table speeds up compression due to reduced cache + * pressure. */ -#define LZX_HASH2_ORDER 12 -#define LZX_HASH2_LENGTH (1UL << LZX_HASH2_ORDER) - -#include "wimlib/lzx_common.h" +#define BT_MATCHFINDER_HASH2_ORDER 12 /* - * The maximum allowed window order for the matchfinder. + * These are the compressor-side limits on the codeword lengths for each Huffman + * code. To make outputting bits slightly faster, some of these limits are + * lower than the limits defined by the LZX format. This does not significantly + * affect the compression ratio, at least for the block sizes we use. */ -#define MATCHFINDER_MAX_WINDOW_ORDER LZX_MAX_WINDOW_ORDER - -#include +#define MAIN_CODEWORD_LIMIT 12 /* 64-bit: can buffer 4 main symbols */ +#define LENGTH_CODEWORD_LIMIT 12 +#define ALIGNED_CODEWORD_LIMIT 7 +#define PRE_CODEWORD_LIMIT 7 -#include "wimlib/bt_matchfinder.h" #include "wimlib/compress_common.h" #include "wimlib/compressor_ops.h" #include "wimlib/error.h" -#include "wimlib/hc_matchfinder.h" #include "wimlib/lz_extend.h" +#include "wimlib/lzx_common.h" #include "wimlib/unaligned.h" #include "wimlib/util.h" +/* Matchfinders with 16-bit positions */ +#define pos_t u16 +#define MF_SUFFIX _16 +#include "wimlib/bt_matchfinder.h" +#include "wimlib/hc_matchfinder.h" + +/* Matchfinders with 32-bit positions */ +#undef pos_t +#undef MF_SUFFIX +#define pos_t u32 +#define MF_SUFFIX _32 +#include "wimlib/bt_matchfinder.h" +#include "wimlib/hc_matchfinder.h" + struct lzx_output_bitstream; /* Codewords for the LZX Huffman codes. */ @@ -412,7 +426,10 @@ struct lzx_compressor { /* Data for greedy or lazy parsing */ struct { /* Hash chains matchfinder (MUST BE LAST!!!) */ - struct hc_matchfinder hc_mf; + union { + struct hc_matchfinder_16 hc_mf_16; + struct hc_matchfinder_32 hc_mf_32; + }; }; /* Data for near-optimal parsing */ @@ -467,15 +484,43 @@ struct lzx_compressor { LZX_MAX_MATCHES_PER_POS + LZX_MAX_MATCH_LEN - 1]; - /* Hash table for finding length 2 matches */ - pos_t hash2_tab[LZX_HASH2_LENGTH]; - /* Binary trees matchfinder (MUST BE LAST!!!) */ - struct bt_matchfinder bt_mf; + union { + struct bt_matchfinder_16 bt_mf_16; + struct bt_matchfinder_32 bt_mf_32; + }; }; }; }; +/* + * Will a matchfinder using 16-bit positions be sufficient for compressing + * buffers of up to the specified size? The limit could be 65536 bytes, but we + * also want to optimize out the use of offset_slot_tab_2 in the 16-bit case. + * This requires that the limit be no more than the length of offset_slot_tab_1 + * (currently 32768). 
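+ *
+ * A quick usage sketch (illustration only; the values follow directly from
+ * the limits just stated):
+ *
+ *	lzx_is_16_bit(32768)  =>  true   (u16 positions, offset_slot_tab_1 only)
+ *	lzx_is_16_bit(32769)  =>  false  (u32 positions, may need offset_slot_tab_2)
+ *
+ * The 32768-byte case is the common one, since it matches the default LZX
+ * chunk size used for WIM resources.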
+ */ +static inline bool +lzx_is_16_bit(size_t max_bufsize) +{ + STATIC_ASSERT(ARRAY_LEN(((struct lzx_compressor *)0)->offset_slot_tab_1) == 32768); + return max_bufsize <= 32768; +} + +/* + * The following macros call either the 16-bit or the 32-bit version of a + * matchfinder function based on the value of 'is_16_bit', which will be known + * at compilation time. + */ + +#define CALL_HC_MF(is_16_bit, c, funcname, ...) \ + ((is_16_bit) ? CONCAT(funcname, _16)(&(c)->hc_mf_16, ##__VA_ARGS__) : \ + CONCAT(funcname, _32)(&(c)->hc_mf_32, ##__VA_ARGS__)); + +#define CALL_BT_MF(is_16_bit, c, funcname, ...) \ + ((is_16_bit) ? CONCAT(funcname, _16)(&(c)->bt_mf_16, ##__VA_ARGS__) : \ + CONCAT(funcname, _32)(&(c)->bt_mf_32, ##__VA_ARGS__)); + /* * Structure to keep track of the current state of sending bits to the * compressed output buffer. @@ -485,7 +530,7 @@ struct lzx_compressor { struct lzx_output_bitstream { /* Bits that haven't yet been written to the output buffer. */ - u32 bitbuf; + machine_word_t bitbuf; /* Number of bits currently held in @bitbuf. */ u32 bitcount; @@ -502,6 +547,10 @@ struct lzx_output_bitstream { u8 *end; }; +/* Can the specified number of bits always be added to 'bitbuf' after any + * pending 16-bit coding units have been flushed? */ +#define CAN_BUFFER(n) ((n) <= (8 * sizeof(machine_word_t)) - 16) + /* * Initialize the output bitstream. * @@ -522,68 +571,38 @@ lzx_init_output(struct lzx_output_bitstream *os, void *buffer, size_t size) os->end = os->start + (size & ~1); } -/* - * Write some bits to the output bitstream. - * - * The bits are given by the low-order @num_bits bits of @bits. Higher-order - * bits in @bits cannot be set. At most 17 bits can be written at once. - * - * @max_num_bits is a compile-time constant that specifies the maximum number of - * bits that can ever be written at the call site. It is used to optimize away - * the conditional code for writing a second 16-bit coding unit when writing - * fewer than 17 bits. - * - * If the output buffer space is exhausted, then the bits will be ignored, and - * lzx_flush_output() will return 0 when it gets called. - */ +/* Add some bits to the bitbuffer variable of the output bitstream. The caller + * must make sure there is enough room. */ static inline void -lzx_write_varbits(struct lzx_output_bitstream *os, - const u32 bits, const unsigned num_bits, - const unsigned max_num_bits) +lzx_add_bits(struct lzx_output_bitstream *os, u32 bits, unsigned num_bits) { - /* This code is optimized for LZX, which never needs to write more than - * 17 bits at once. */ - LZX_ASSERT(num_bits <= 17); - LZX_ASSERT(num_bits <= max_num_bits); - LZX_ASSERT(os->bitcount <= 15); - - /* Add the bits to the bit buffer variable. @bitcount will be at most - * 15, so there will be just enough space for the maximum possible - * @num_bits of 17. */ - os->bitcount += num_bits; os->bitbuf = (os->bitbuf << num_bits) | bits; + os->bitcount += num_bits; +} - /* Check whether any coding units need to be written. */ - if (os->bitcount >= 16) { - - os->bitcount -= 16; - - /* Write a coding unit, unless it would overflow the buffer. */ - if (os->next != os->end) { - put_unaligned_u16_le(os->bitbuf >> os->bitcount, os->next); - os->next += 2; - } - - /* If writing 17 bits, a second coding unit might need to be - * written. But because 'max_num_bits' is a compile-time - * constant, the compiler will optimize away this code at most - * call sites. 
*/ - if (max_num_bits == 17 && os->bitcount == 16) { - if (os->next != os->end) { - put_unaligned_u16_le(os->bitbuf, os->next); - os->next += 2; - } - os->bitcount = 0; - } - } +/* Flush bits from the bitbuffer variable to the output buffer. 'max_num_bits' + * specifies the maximum number of bits that may have been added since the last + * flush. */ +static inline void +lzx_flush_bits(struct lzx_output_bitstream *os, unsigned max_num_bits) +{ + if (os->end - os->next < 6) + return; + put_unaligned_u16_le(os->bitbuf >> (os->bitcount - 16), os->next + 0); + if (max_num_bits > 16) + put_unaligned_u16_le(os->bitbuf >> (os->bitcount - 32), os->next + 2); + if (max_num_bits > 32) + put_unaligned_u16_le(os->bitbuf >> (os->bitcount - 48), os->next + 4); + os->next += (os->bitcount >> 4) << 1; + os->bitcount &= 15; } -/* Use when @num_bits is a compile-time constant. Otherwise use - * lzx_write_varbits(). */ +/* Add at most 16 bits to the bitbuffer and flush it. */ static inline void lzx_write_bits(struct lzx_output_bitstream *os, u32 bits, unsigned num_bits) { - lzx_write_varbits(os, bits, num_bits, num_bits); + lzx_add_bits(os, bits, num_bits); + lzx_flush_bits(os, 16); } /* @@ -593,7 +612,7 @@ lzx_write_bits(struct lzx_output_bitstream *os, u32 bits, unsigned num_bits) static u32 lzx_flush_output(struct lzx_output_bitstream *os) { - if (os->next == os->end) + if (os->end - os->next < 6) return 0; if (os->bitcount != 0) { @@ -614,20 +633,27 @@ lzx_make_huffman_codes(struct lzx_compressor *c) const struct lzx_freqs *freqs = &c->freqs; struct lzx_codes *codes = &c->codes[c->codes_index]; + STATIC_ASSERT(MAIN_CODEWORD_LIMIT >= 9 && + MAIN_CODEWORD_LIMIT <= LZX_MAX_MAIN_CODEWORD_LEN); + STATIC_ASSERT(LENGTH_CODEWORD_LIMIT >= 9 && + LENGTH_CODEWORD_LIMIT <= LZX_MAX_LEN_CODEWORD_LEN); + STATIC_ASSERT(ALIGNED_CODEWORD_LIMIT >= LZX_NUM_ALIGNED_OFFSET_BITS && + ALIGNED_CODEWORD_LIMIT <= LZX_MAX_ALIGNED_CODEWORD_LEN); + make_canonical_huffman_code(c->num_main_syms, - LZX_MAX_MAIN_CODEWORD_LEN, + MAIN_CODEWORD_LIMIT, freqs->main, codes->lens.main, codes->codewords.main); make_canonical_huffman_code(LZX_LENCODE_NUM_SYMBOLS, - LZX_MAX_LEN_CODEWORD_LEN, + LENGTH_CODEWORD_LIMIT, freqs->len, codes->lens.len, codes->codewords.len); make_canonical_huffman_code(LZX_ALIGNEDCODE_NUM_SYMBOLS, - LZX_MAX_ALIGNED_CODEWORD_LEN, + ALIGNED_CODEWORD_LIMIT, freqs->aligned, codes->lens.aligned, codes->codewords.aligned); @@ -786,8 +812,10 @@ lzx_write_compressed_code(struct lzx_output_bitstream *os, precode_items); /* Build the precode. 
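	 *
	 * The precode symbols below are then written with the batched
	 * add/flush scheme defined above: lzx_add_bits() only accumulates
	 * bits in the machine_word_t bitbuffer, and a single lzx_flush_bits()
	 * per precode item writes out any completed 16-bit coding units.  As
	 * a worked bound: one iteration adds at most one precode codeword
	 * (PRE_CODEWORD_LIMIT = 7 bits), one extra bit, and one more precode
	 * codeword, i.e. 2 * PRE_CODEWORD_LIMIT + 1 = 15 bits, and
	 * CAN_BUFFER(15) holds even for a 32-bit machine_word_t, since
	 * 15 <= 32 - 16.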
*/ + STATIC_ASSERT(PRE_CODEWORD_LIMIT >= 5 && + PRE_CODEWORD_LIMIT <= LZX_MAX_PRE_CODEWORD_LEN); make_canonical_huffman_code(LZX_PRECODE_NUM_SYMBOLS, - LZX_MAX_PRE_CODEWORD_LEN, + PRE_CODEWORD_LIMIT, precode_freqs, precode_lens, precode_codewords); @@ -799,22 +827,22 @@ lzx_write_compressed_code(struct lzx_output_bitstream *os, for (i = 0; i < num_precode_items; i++) { precode_item = precode_items[i]; precode_sym = precode_item & 0x1F; - lzx_write_varbits(os, precode_codewords[precode_sym], - precode_lens[precode_sym], - LZX_MAX_PRE_CODEWORD_LEN); + lzx_add_bits(os, precode_codewords[precode_sym], + precode_lens[precode_sym]); if (precode_sym >= 17) { if (precode_sym == 17) { - lzx_write_bits(os, precode_item >> 5, 4); + lzx_add_bits(os, precode_item >> 5, 4); } else if (precode_sym == 18) { - lzx_write_bits(os, precode_item >> 5, 5); + lzx_add_bits(os, precode_item >> 5, 5); } else { - lzx_write_bits(os, (precode_item >> 5) & 1, 1); + lzx_add_bits(os, (precode_item >> 5) & 1, 1); precode_sym = precode_item >> 6; - lzx_write_varbits(os, precode_codewords[precode_sym], - precode_lens[precode_sym], - LZX_MAX_PRE_CODEWORD_LEN); + lzx_add_bits(os, precode_codewords[precode_sym], + precode_lens[precode_sym]); } } + STATIC_ASSERT(CAN_BUFFER(2 * PRE_CODEWORD_LIMIT + 1)); + lzx_flush_bits(os, 2 * PRE_CODEWORD_LIMIT + 1); } *(u8 *)(lens + num_lens) = saved; @@ -860,13 +888,53 @@ lzx_write_sequences(struct lzx_output_bitstream *os, int block_type, /* Output the literal run of the sequence. */ - if (litrunlen) { - do { - unsigned lit = *block_data++; - lzx_write_varbits(os, codes->codewords.main[lit], - codes->lens.main[lit], - LZX_MAX_MAIN_CODEWORD_LEN); - } while (--litrunlen); + if (litrunlen) { /* Is the literal run nonempty? */ + + /* Verify optimization is enabled on 64-bit */ + STATIC_ASSERT(sizeof(machine_word_t) < 8 || + CAN_BUFFER(4 * MAIN_CODEWORD_LIMIT)); + + if (CAN_BUFFER(4 * MAIN_CODEWORD_LIMIT)) { + + /* 64-bit: write 4 literals at a time. */ + while (litrunlen >= 4) { + unsigned lit0 = block_data[0]; + unsigned lit1 = block_data[1]; + unsigned lit2 = block_data[2]; + unsigned lit3 = block_data[3]; + lzx_add_bits(os, codes->codewords.main[lit0], codes->lens.main[lit0]); + lzx_add_bits(os, codes->codewords.main[lit1], codes->lens.main[lit1]); + lzx_add_bits(os, codes->codewords.main[lit2], codes->lens.main[lit2]); + lzx_add_bits(os, codes->codewords.main[lit3], codes->lens.main[lit3]); + lzx_flush_bits(os, 4 * MAIN_CODEWORD_LIMIT); + block_data += 4; + litrunlen -= 4; + } + if (litrunlen--) { + unsigned lit = *block_data++; + lzx_add_bits(os, codes->codewords.main[lit], codes->lens.main[lit]); + if (litrunlen--) { + unsigned lit = *block_data++; + lzx_add_bits(os, codes->codewords.main[lit], codes->lens.main[lit]); + if (litrunlen--) { + unsigned lit = *block_data++; + lzx_add_bits(os, codes->codewords.main[lit], codes->lens.main[lit]); + lzx_flush_bits(os, 3 * MAIN_CODEWORD_LIMIT); + } else { + lzx_flush_bits(os, 2 * MAIN_CODEWORD_LIMIT); + } + } else { + lzx_flush_bits(os, 1 * MAIN_CODEWORD_LIMIT); + } + } + } else { + /* 32-bit: write 1 literal at a time. */ + do { + unsigned lit = *block_data++; + lzx_add_bits(os, codes->codewords.main[lit], codes->lens.main[lit]); + lzx_flush_bits(os, MAIN_CODEWORD_LIMIT); + } while (--litrunlen); + } } /* Was this the last literal run? 
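		 *
		 * (A worked check of the literal batching above: on 64-bit,
		 * four literals are buffered per flush, which is safe because
		 * 4 * MAIN_CODEWORD_LIMIT = 48 bits and CAN_BUFFER(48) holds
		 * exactly, as 48 <= 64 - 16 = 48.  On 32-bit, CAN_BUFFER(48)
		 * is false, so the fallback path flushes after every literal;
		 * one codeword of at most MAIN_CODEWORD_LIMIT = 12 bits
		 * always fits, as 12 <= 32 - 16.)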
*/ @@ -887,17 +955,26 @@ lzx_write_sequences(struct lzx_output_bitstream *os, int block_type, num_extra_bits = lzx_extra_offset_bits[offset_slot]; extra_bits = adjusted_offset - lzx_offset_slot_base[offset_slot]; + #define MAX_MATCH_BITS (MAIN_CODEWORD_LIMIT + LENGTH_CODEWORD_LIMIT + \ + 14 + ALIGNED_CODEWORD_LIMIT) + + /* Verify optimization is enabled on 64-bit */ + STATIC_ASSERT(sizeof(machine_word_t) < 8 || CAN_BUFFER(MAX_MATCH_BITS)); + /* Output the main symbol for the match. */ - lzx_write_varbits(os, codes->codewords.main[main_symbol], - codes->lens.main[main_symbol], - LZX_MAX_MAIN_CODEWORD_LEN); + + lzx_add_bits(os, codes->codewords.main[main_symbol], + codes->lens.main[main_symbol]); + if (!CAN_BUFFER(MAX_MATCH_BITS)) + lzx_flush_bits(os, MAIN_CODEWORD_LIMIT); /* If needed, output the length symbol for the match. */ if (adjusted_length >= LZX_NUM_PRIMARY_LENS) { - lzx_write_varbits(os, codes->codewords.len[adjusted_length - LZX_NUM_PRIMARY_LENS], - codes->lens.len[adjusted_length - LZX_NUM_PRIMARY_LENS], - LZX_MAX_LEN_CODEWORD_LEN); + lzx_add_bits(os, codes->codewords.len[adjusted_length - LZX_NUM_PRIMARY_LENS], + codes->lens.len[adjusted_length - LZX_NUM_PRIMARY_LENS]); + if (!CAN_BUFFER(MAX_MATCH_BITS)) + lzx_flush_bits(os, LENGTH_CODEWORD_LIMIT); } /* Output the extra offset bits for the match. In aligned @@ -908,17 +985,24 @@ lzx_write_sequences(struct lzx_output_bitstream *os, int block_type, if ((adjusted_offset & ones_if_aligned) >= 16) { - lzx_write_varbits(os, extra_bits >> LZX_NUM_ALIGNED_OFFSET_BITS, - num_extra_bits - LZX_NUM_ALIGNED_OFFSET_BITS, - 14); + lzx_add_bits(os, extra_bits >> LZX_NUM_ALIGNED_OFFSET_BITS, + num_extra_bits - LZX_NUM_ALIGNED_OFFSET_BITS); + if (!CAN_BUFFER(MAX_MATCH_BITS)) + lzx_flush_bits(os, 14); - lzx_write_varbits(os, codes->codewords.aligned[adjusted_offset & LZX_ALIGNED_OFFSET_BITMASK], - codes->lens.aligned[adjusted_offset & LZX_ALIGNED_OFFSET_BITMASK], - LZX_MAX_ALIGNED_CODEWORD_LEN); + lzx_add_bits(os, codes->codewords.aligned[adjusted_offset & LZX_ALIGNED_OFFSET_BITMASK], + codes->lens.aligned[adjusted_offset & LZX_ALIGNED_OFFSET_BITMASK]); + if (!CAN_BUFFER(MAX_MATCH_BITS)) + lzx_flush_bits(os, ALIGNED_CODEWORD_LIMIT); } else { - lzx_write_varbits(os, extra_bits, num_extra_bits, 17); + lzx_add_bits(os, extra_bits, num_extra_bits); + if (!CAN_BUFFER(MAX_MATCH_BITS)) + lzx_flush_bits(os, 17); } + if (CAN_BUFFER(MAX_MATCH_BITS)) + lzx_flush_bits(os, MAX_MATCH_BITS); + /* Advance to the next sequence. */ seq++; } @@ -935,9 +1019,6 @@ lzx_write_compressed_block(const u8 *block_begin, const struct lzx_lens * prev_lens, struct lzx_output_bitstream * os) { - LZX_ASSERT(block_type == LZX_BLOCKTYPE_ALIGNED || - block_type == LZX_BLOCKTYPE_VERBATIM); - /* The first three bits indicate the type of block and are one of the * LZX_BLOCKTYPE_* constants. */ lzx_write_bits(os, block_type, 3); @@ -1025,9 +1106,10 @@ lzx_choose_verbatim_or_aligned(const struct lzx_freqs * freqs, * compressor's acceleration tables to speed up the mapping. 
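 *
 * For example, an adjusted offset of 1000 is answered directly from
 * offset_slot_tab_1[1000], while an adjusted offset of 100000 uses
 * offset_slot_tab_2[100000 >> 14] = offset_slot_tab_2[6].  When 'is_16_bit'
 * is true, every adjusted offset fits in offset_slot_tab_1 (this is exactly
 * why lzx_is_16_bit() caps the buffer size at the length of that table), so
 * the compiler can drop the offset_slot_tab_2 branch entirely.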
*/ static inline unsigned -lzx_comp_get_offset_slot(struct lzx_compressor *c, u32 adjusted_offset) +lzx_comp_get_offset_slot(struct lzx_compressor *c, u32 adjusted_offset, + bool is_16_bit) { - if (adjusted_offset < ARRAY_LEN(c->offset_slot_tab_1)) + if (is_16_bit || adjusted_offset < ARRAY_LEN(c->offset_slot_tab_1)) return c->offset_slot_tab_1[adjusted_offset]; return c->offset_slot_tab_2[adjusted_offset >> 14]; } @@ -1076,7 +1158,7 @@ lzx_record_literal(struct lzx_compressor *c, unsigned literal, u32 *litrunlen_p) * offsets queue. */ static inline void lzx_record_match(struct lzx_compressor *c, unsigned length, u32 offset_data, - u32 recent_offsets[LZX_NUM_RECENT_OFFSETS], + u32 recent_offsets[LZX_NUM_RECENT_OFFSETS], bool is_16_bit, u32 *litrunlen_p, struct lzx_sequence **next_seq_p) { u32 litrunlen = *litrunlen_p; @@ -1097,7 +1179,7 @@ lzx_record_match(struct lzx_compressor *c, unsigned length, u32 offset_data, } /* Compute the offset slot */ - offset_slot = lzx_comp_get_offset_slot(c, offset_data); + offset_slot = lzx_comp_get_offset_slot(c, offset_data, is_16_bit); /* Compute the match header. */ v += offset_slot * LZX_NUM_LEN_HEADERS; @@ -1150,8 +1232,8 @@ lzx_finish_sequence(struct lzx_sequence *last_seq, u32 litrunlen) * beginning of the block), but this doesn't matter because this function only * computes frequencies. */ -static void -lzx_tally_item_list(struct lzx_compressor *c, u32 block_size) +static inline void +lzx_tally_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit) { u32 node_idx = block_size; for (;;) { @@ -1194,7 +1276,7 @@ lzx_tally_item_list(struct lzx_compressor *c, u32 block_size) } /* Tally the main symbol. */ - offset_slot = lzx_comp_get_offset_slot(c, offset_data); + offset_slot = lzx_comp_get_offset_slot(c, offset_data, is_16_bit); v += offset_slot * LZX_NUM_LEN_HEADERS; c->freqs.main[LZX_NUM_CHARS + v]++; @@ -1212,8 +1294,8 @@ lzx_tally_item_list(struct lzx_compressor *c, u32 block_size) * first-to-last order. The return value is the index in c->chosen_sequences at * which the lzx_sequences begin. */ -static u32 -lzx_record_item_list(struct lzx_compressor *c, u32 block_size) +static inline u32 +lzx_record_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit) { u32 node_idx = block_size; u32 seq_idx = ARRAY_LEN(c->chosen_sequences) - 1; @@ -1270,7 +1352,7 @@ lzx_record_item_list(struct lzx_compressor *c, u32 block_size) } /* Tally the main symbol. */ - offset_slot = lzx_comp_get_offset_slot(c, offset_data); + offset_slot = lzx_comp_get_offset_slot(c, offset_data, is_16_bit); v += offset_slot * LZX_NUM_LEN_HEADERS; c->freqs.main[LZX_NUM_CHARS + v]++; @@ -1322,11 +1404,12 @@ out: * later. The algorithm does not solve this problem; it only considers the * lowest cost to reach each individual position. 
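 *
 * A rough sketch of the forward pass, in pseudo-C (illustration only; the
 * real loop below additionally threads the recent-offsets queue and the
 * cached match lists through each node):
 *
 *	cost[0] = 0;  cost[1..block_size] = INFINITY;
 *	for (i = 0; i < block_size; i++) {
 *		relax(i + 1, cost[i] + literal_cost(block[i]));
 *		for each cached match (max_len, offset) at position i:
 *			for (len = minimum match length; len <= max_len; len++)
 *				relax(i + len, cost[i] + match_cost(len, offset));
 *	}
 *
 * where relax(j, c) lowers cost[j] to c and records the item taken whenever
 * c < cost[j]; the chosen items are then walked backwards from position
 * block_size to recover the path.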
*/ -static struct lzx_lru_queue +static inline struct lzx_lru_queue lzx_find_min_cost_path(struct lzx_compressor * const restrict c, const u8 * const restrict block_begin, const u32 block_size, - const struct lzx_lru_queue initial_queue) + const struct lzx_lru_queue initial_queue, + bool is_16_bit) { struct lzx_optimum_node *cur_node = c->optimum_nodes; struct lzx_optimum_node * const end_node = &c->optimum_nodes[block_size]; @@ -1472,7 +1555,8 @@ lzx_find_min_cost_path(struct lzx_compressor * const restrict c, do { u32 offset = cache_ptr->offset; u32 offset_data = offset + LZX_OFFSET_ADJUSTMENT; - unsigned offset_slot = lzx_comp_get_offset_slot(c, offset_data); + unsigned offset_slot = lzx_comp_get_offset_slot(c, offset_data, + is_16_bit); do { u32 cost = cur_node->cost + c->costs.match_cost[offset_slot][ @@ -1639,12 +1723,13 @@ lzx_update_costs(struct lzx_compressor *c) lzx_compute_match_costs(c); } -static struct lzx_lru_queue +static inline struct lzx_lru_queue lzx_optimize_and_write_block(struct lzx_compressor * const restrict c, struct lzx_output_bitstream * const restrict os, const u8 * const restrict block_begin, const u32 block_size, - const struct lzx_lru_queue initial_queue) + const struct lzx_lru_queue initial_queue, + bool is_16_bit) { unsigned num_passes_remaining = c->num_optim_passes; struct lzx_lru_queue new_queue; @@ -1658,16 +1743,16 @@ lzx_optimize_and_write_block(struct lzx_compressor * const restrict c, lzx_reset_symbol_frequencies(c); do { new_queue = lzx_find_min_cost_path(c, block_begin, block_size, - initial_queue); + initial_queue, is_16_bit); if (num_passes_remaining > 1) { - lzx_tally_item_list(c, block_size); + lzx_tally_item_list(c, block_size, is_16_bit); lzx_make_huffman_codes(c); lzx_update_costs(c); lzx_reset_symbol_frequencies(c); } } while (--num_passes_remaining); - seq_idx = lzx_record_item_list(c, block_size); + seq_idx = lzx_record_item_list(c, block_size, is_16_bit); lzx_finish_block(c, os, block_begin, block_size, seq_idx); return new_queue; } @@ -1685,21 +1770,20 @@ lzx_optimize_and_write_block(struct lzx_compressor * const restrict c, * time, but rather to produce a compression ratio significantly better than a * simpler "greedy" or "lazy" parse while still being relatively fast. */ -static void +static inline void lzx_compress_near_optimal(struct lzx_compressor *c, - struct lzx_output_bitstream *os) + struct lzx_output_bitstream *os, + bool is_16_bit) { const u8 * const in_begin = c->in_buffer; const u8 * in_next = in_begin; const u8 * const in_end = in_begin + c->in_nbytes; unsigned max_len = LZX_MAX_MATCH_LEN; unsigned nice_len = min(c->nice_match_length, max_len); - u32 next_hash; + u32 next_hash = 0; struct lzx_lru_queue queue; - bt_matchfinder_init(&c->bt_mf); - memset(c->hash2_tab, 0, sizeof(c->hash2_tab)); - next_hash = bt_matchfinder_hash_3_bytes(in_next); + CALL_BT_MF(is_16_bit, c, bt_matchfinder_init); lzx_lru_queue_init(&queue); do { @@ -1712,8 +1796,6 @@ lzx_compress_near_optimal(struct lzx_compressor *c, struct lz_match *cache_ptr = c->match_cache; do { struct lz_match *lz_matchptr; - u32 hash2; - pos_t cur_match; unsigned best_len; /* If approaching the end of the input buffer, adjust @@ -1735,33 +1817,16 @@ lzx_compress_near_optimal(struct lzx_compressor *c, } } - lz_matchptr = cache_ptr + 1; - - /* Check for a length 2 match. 
*/ - hash2 = lz_hash_2_bytes(in_next, LZX_HASH2_ORDER); - cur_match = c->hash2_tab[hash2]; - c->hash2_tab[hash2] = in_next - in_begin; - if (cur_match != 0 && - (LZX_HASH2_ORDER == 16 || - load_u16_unaligned(&in_begin[cur_match]) == - load_u16_unaligned(in_next))) - { - lz_matchptr->length = 2; - lz_matchptr->offset = in_next - &in_begin[cur_match]; - lz_matchptr++; - } - - /* Check for matches of length >= 3. */ - lz_matchptr = bt_matchfinder_get_matches(&c->bt_mf, - in_begin, - in_next, - 3, - max_len, - nice_len, - c->max_search_depth, - &next_hash, - &best_len, - lz_matchptr); + /* Check for matches. */ + lz_matchptr = CALL_BT_MF(is_16_bit, c, bt_matchfinder_get_matches, + in_begin, + in_next - in_begin, + max_len, + nice_len, + c->max_search_depth, + &next_hash, + &best_len, + cache_ptr + 1); in_next++; cache_ptr->length = lz_matchptr - (cache_ptr + 1); cache_ptr = lz_matchptr; @@ -1791,15 +1856,13 @@ lzx_compress_near_optimal(struct lzx_compressor *c, continue; } } - c->hash2_tab[lz_hash_2_bytes(in_next, LZX_HASH2_ORDER)] = - in_next - in_begin; - bt_matchfinder_skip_position(&c->bt_mf, - in_begin, - in_next, - in_end, - nice_len, - c->max_search_depth, - &next_hash); + CALL_BT_MF(is_16_bit, c, bt_matchfinder_skip_position, + in_begin, + in_next - in_begin, + max_len, + nice_len, + c->max_search_depth, + &next_hash); in_next++; cache_ptr->length = 0; cache_ptr++; @@ -1813,10 +1876,24 @@ lzx_compress_near_optimal(struct lzx_compressor *c, queue = lzx_optimize_and_write_block(c, os, in_block_begin, in_next - in_block_begin, - queue); + queue, is_16_bit); } while (in_next != in_end); } +static void +lzx_compress_near_optimal_16(struct lzx_compressor *c, + struct lzx_output_bitstream *os) +{ + lzx_compress_near_optimal(c, os, true); +} + +static void +lzx_compress_near_optimal_32(struct lzx_compressor *c, + struct lzx_output_bitstream *os) +{ + lzx_compress_near_optimal(c, os, false); +} + /* * Given a pointer to the current byte sequence and the current list of recent * match offsets, find the longest repeat offset match. @@ -1833,7 +1910,6 @@ lzx_find_longest_repeat_offset_match(const u8 * const in_next, unsigned *rep_max_idx_ret) { STATIC_ASSERT(LZX_NUM_RECENT_OFFSETS == 3); - LZX_ASSERT(bytes_remaining >= 2); const unsigned max_len = min(bytes_remaining, LZX_MAX_MATCH_LEN); const u16 next_2_bytes = load_u16_unaligned(in_next); @@ -1893,8 +1969,9 @@ lzx_repeat_offset_match_score(unsigned rep_len, unsigned rep_idx) } /* This is the "lazy" LZX compressor. */ -static void -lzx_compress_lazy(struct lzx_compressor *c, struct lzx_output_bitstream *os) +static inline void +lzx_compress_lazy(struct lzx_compressor *c, struct lzx_output_bitstream *os, + bool is_16_bit) { const u8 * const in_begin = c->in_buffer; const u8 * in_next = in_begin; @@ -1905,7 +1982,7 @@ lzx_compress_lazy(struct lzx_compressor *c, struct lzx_output_bitstream *os) u32 recent_offsets[3] = {1, 1, 1}; u32 next_hashes[2] = {}; - hc_matchfinder_init(&c->hc_mf); + CALL_HC_MF(is_16_bit, c, hc_matchfinder_init); do { /* Starting a new block */ @@ -1938,15 +2015,15 @@ lzx_compress_lazy(struct lzx_compressor *c, struct lzx_output_bitstream *os) /* Find the longest match at the current position. 
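			 *
			 * CALL_HC_MF() expands to a direct call of either the
			 * _16 or the _32 variant of the matchfinder function.
			 * Since this function is only instantiated through
			 * lzx_compress_lazy_16() and lzx_compress_lazy_32()
			 * below, 'is_16_bit' is a compile-time constant here
			 * and the selection folds to, e.g.:
			 *
			 *	cur_len = hc_matchfinder_longest_match_16(
			 *			&c->hc_mf_16, in_begin, ...);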
*/ - cur_len = hc_matchfinder_longest_match(&c->hc_mf, - in_begin, - in_next - in_begin, - 2, - max_len, - nice_len, - c->max_search_depth, - next_hashes, - &cur_offset); + cur_len = CALL_HC_MF(is_16_bit, c, hc_matchfinder_longest_match, + in_begin, + in_next - in_begin, + 2, + max_len, + nice_len, + c->max_search_depth, + next_hashes, + &cur_offset); if (cur_len < 3 || (cur_len == 3 && cur_offset >= 8192 - LZX_OFFSET_ADJUSTMENT && @@ -2004,15 +2081,15 @@ lzx_compress_lazy(struct lzx_compressor *c, struct lzx_output_bitstream *os) nice_len = min(max_len, nice_len); } - next_len = hc_matchfinder_longest_match(&c->hc_mf, - in_begin, - in_next - in_begin, - cur_len - 2, - max_len, - nice_len, - c->max_search_depth / 2, - next_hashes, - &next_offset); + next_len = CALL_HC_MF(is_16_bit, c, hc_matchfinder_longest_match, + in_begin, + in_next - in_begin, + cur_len - 2, + max_len, + nice_len, + c->max_search_depth / 2, + next_hashes, + &next_offset); if (next_len <= cur_len - 2) { in_next++; @@ -2062,13 +2139,14 @@ lzx_compress_lazy(struct lzx_compressor *c, struct lzx_output_bitstream *os) choose_cur_match: lzx_record_match(c, cur_len, cur_offset_data, - recent_offsets, &litrunlen, &next_seq); - in_next = hc_matchfinder_skip_positions(&c->hc_mf, - in_begin, - in_next - in_begin, - in_end - in_begin, - skip_len, - next_hashes); + recent_offsets, is_16_bit, + &litrunlen, &next_seq); + in_next = CALL_HC_MF(is_16_bit, c, hc_matchfinder_skip_positions, + in_begin, + in_next - in_begin, + in_end - in_begin, + skip_len, + next_hashes); } while (in_next < in_block_end); lzx_finish_sequence(next_seq, litrunlen); @@ -2078,6 +2156,18 @@ lzx_compress_lazy(struct lzx_compressor *c, struct lzx_output_bitstream *os) } while (in_next != in_end); } +static void +lzx_compress_lazy_16(struct lzx_compressor *c, struct lzx_output_bitstream *os) +{ + lzx_compress_lazy(c, os, true); +} + +static void +lzx_compress_lazy_32(struct lzx_compressor *c, struct lzx_output_bitstream *os) +{ + lzx_compress_lazy(c, os, false); +} + /* Generate the acceleration tables for offset slots. */ static void lzx_init_offset_slot_tabs(struct lzx_compressor *c) @@ -2108,11 +2198,19 @@ static size_t lzx_get_compressor_size(size_t max_bufsize, unsigned compression_level) { if (compression_level <= LZX_MAX_FAST_LEVEL) { - return offsetof(struct lzx_compressor, hc_mf) + - hc_matchfinder_size(max_bufsize); + if (lzx_is_16_bit(max_bufsize)) + return offsetof(struct lzx_compressor, hc_mf_16) + + hc_matchfinder_size_16(max_bufsize); + else + return offsetof(struct lzx_compressor, hc_mf_32) + + hc_matchfinder_size_32(max_bufsize); } else { - return offsetof(struct lzx_compressor, bt_mf) + - bt_matchfinder_size(max_bufsize); + if (lzx_is_16_bit(max_bufsize)) + return offsetof(struct lzx_compressor, bt_mf_16) + + bt_matchfinder_size_16(max_bufsize); + else + return offsetof(struct lzx_compressor, bt_mf_32) + + bt_matchfinder_size_32(max_bufsize); } } @@ -2161,7 +2259,10 @@ lzx_create_compressor(size_t max_bufsize, unsigned compression_level, /* Fast compression: Use lazy parsing. */ - c->impl = lzx_compress_lazy; + if (lzx_is_16_bit(max_bufsize)) + c->impl = lzx_compress_lazy_16; + else + c->impl = lzx_compress_lazy_32; c->max_search_depth = (36 * compression_level) / 20; c->nice_match_length = (72 * compression_level) / 20; @@ -2174,7 +2275,10 @@ lzx_create_compressor(size_t max_bufsize, unsigned compression_level, /* Normal / high compression: Use near-optimal parsing. 
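		 *
		 * (Fast levels, i.e. <= LZX_MAX_FAST_LEVEL = 34, took the
		 * lazy branch above instead; for instance, level 20 yields
		 * max_search_depth = (36 * 20) / 20 = 36 and
		 * nice_match_length = (72 * 20) / 20 = 72.)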
*/ - c->impl = lzx_compress_near_optimal; + if (lzx_is_16_bit(max_bufsize)) + c->impl = lzx_compress_near_optimal_16; + else + c->impl = lzx_compress_near_optimal_32; /* Scale nice_match_length and max_search_depth with the * compression level. */
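
A minimal standalone sketch of the "template by repeated inclusion" technique
this patch applies to hc_matchfinder.h and bt_matchfinder.h (hypothetical file
names mini_mf.h and user.c; illustration only, not part of the patch):

	/* mini_mf.h -- deliberately has no include guard, so it can be
	 * included once per definition of pos_t / MF_SUFFIX */
	static inline size_t
	CONCAT(mini_mf_size, MF_SUFFIX)(size_t max_bufsize)
	{
		/* one table entry per buffer position */
		return max_bufsize * sizeof(pos_t);
	}

	/* user.c */
	#include <stddef.h>
	#include <stdint.h>

	#define CONCAT_IMPL(a, b)	a##b
	#define CONCAT(a, b)		CONCAT_IMPL(a, b)  /* expands args first */

	#define pos_t		uint16_t
	#define MF_SUFFIX	_16
	#include "mini_mf.h"
	#undef pos_t
	#undef MF_SUFFIX

	#define pos_t		uint32_t
	#define MF_SUFFIX	_32
	#include "mini_mf.h"
	#undef pos_t
	#undef MF_SUFFIX

	/* Both mini_mf_size_16() and mini_mf_size_32() now exist; a
	 * compile-time flag can choose between them with no indirection,
	 * which is what CALL_HC_MF() / CALL_BT_MF() do for the real
	 * matchfinders. */

This buys the memory savings of 16-bit positions for small buffers without
duplicating the matchfinder source and without an indirect call: each user is
compiled twice, once per position type, and the branch on the compile-time
constant 'is_16_bit' is discarded as dead code.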