X-Git-Url: https://wimlib.net/git/?p=wimlib;a=blobdiff_plain;f=src%2Flzms_common.c;h=57c17d4d3a5a1ba9bf3b5acff6edcbfeb0a0cb4b;hp=117e2f3222198a860464ec668ab16777632dbd01;hb=76689b1cac26c545260568997ae7fb949846f302;hpb=d1284a3b721162794ebd7131d090ab7c0cba92a3

diff --git a/src/lzms_common.c b/src/lzms_common.c
index 117e2f32..57c17d4d 100644
--- a/src/lzms_common.c
+++ b/src/lzms_common.c
@@ -23,9 +23,13 @@
 # include "config.h"
 #endif
 
-#include "wimlib/endianness.h"
 #include "wimlib/lzms_common.h"
 #include "wimlib/unaligned.h"
+#include "wimlib/x86_cpu_features.h"
+
+#ifdef __x86_64__
+# include <emmintrin.h>
+#endif
 
 /* Table: offset slot => offset slot base value */
 const u32 lzms_offset_slot_base[LZMS_MAX_NUM_OFFSET_SYMS + 1] = {
@@ -303,8 +307,8 @@ const u32 lzms_length_slot_base[LZMS_NUM_LENGTH_SYMS + 1] = {
 	0x0000009b, 0x000000ab, 0x000000cb, 0x000000eb,
 	0x0000012b, 0x000001ab, 0x000002ab, 0x000004ab,
 	0x000008ab, 0x000108ab, 0x400108ab,
-	/* The last entry is extra; it is equal to LZMS_MAX_MATCH_LEN + 1 and is
-	 * here to aid binary search. */
+	/* The last entry is extra; it is equal to LZMS_MAX_MATCH_LENGTH + 1 and
+	 * is here to aid binary search. */
 };
 
 /* Table: length slot => number of extra length bits */
@@ -347,19 +351,191 @@ lzms_get_num_offset_slots(size_t uncompressed_size)
 }
 
 void
-lzms_init_probability_entries(struct lzms_probability_entry *entries, size_t count)
+lzms_init_probabilities(struct lzms_probabilites *probs)
 {
-	for (size_t i = 0; i < count; i++) {
+	struct lzms_probability_entry *entries =
+		(struct lzms_probability_entry *)probs;
+	size_t num_entries = sizeof(struct lzms_probabilites) /
+			     sizeof(struct lzms_probability_entry);
+	for (size_t i = 0; i < num_entries; i++) {
 		entries[i].num_recent_zero_bits = LZMS_INITIAL_PROBABILITY;
 		entries[i].recent_bits = LZMS_INITIAL_RECENT_BITS;
 	}
 }
 
 void
-lzms_init_symbol_frequencies(u32 freqs[], size_t num_syms)
+lzms_init_symbol_frequencies(u32 freqs[], unsigned num_syms)
+{
+	for (unsigned sym = 0; sym < num_syms; sym++)
+		freqs[sym] = 1;
+}
+
+void
+lzms_dilute_symbol_frequencies(u32 freqs[], unsigned num_syms)
+{
+	for (unsigned sym = 0; sym < num_syms; sym++)
+		freqs[sym] = (freqs[sym] >> 1) + 1;
+}
+
+
+#ifdef __x86_64__
+static inline u8 *
+find_next_opcode_sse4_2(u8 *p)
+{
+	const __v16qi potential_opcodes = (__v16qi) {0x48, 0x4C, 0xE8, 0xE9, 0xF0, 0xFF};
+	__asm__(
+		"  pcmpestri $0x0, (%[p]), %[potential_opcodes]  \n"
+		"  jc 2f                                         \n"
+		"1:                                              \n"
+		"  add $0x10, %[p]                               \n"
+		"  pcmpestri $0x0, (%[p]), %[potential_opcodes]  \n"
+		"  jnc 1b                                        \n"
+		"2:                                              \n"
+		"  add %%rcx, %[p]                               \n"
+		: [p] "+r" (p)
+		: [potential_opcodes] "x" (potential_opcodes), "a" (6), "d" (16)
+		: "rcx", "cc"
+	);
+
+	return p;
+}
+#endif /* __x86_64__ */
+
+static inline u8 *
+find_next_opcode_default(u8 *p)
 {
-	for (size_t i = 0; i < num_syms; i++)
-		freqs[i] = 1;
+	/*
+	 * The following table is used to accelerate the common case where the
+	 * byte has nothing to do with x86 translation and must simply be
+	 * skipped.  This was faster than the following alternatives:
+	 * - Jump table with 256 entries
+	 * - Switch statement with default
+	 */
+	static const u8 is_potential_opcode[256] = {
+		[0x48] = 1, [0x4C] = 1, [0xE8] = 1,
+		[0xE9] = 1, [0xF0] = 1, [0xFF] = 1,
+	};
+
+	for (;;) {
+		if (is_potential_opcode[*p])
+			break;
+		p++;
+		if (is_potential_opcode[*p])
+			break;
+		p++;
+		if (is_potential_opcode[*p])
+			break;
+		p++;
+		if (is_potential_opcode[*p])
+			break;
+		p++;
+	}
+	return p;
+}
+
+static inline u8 *
+translate_if_needed(u8 *data, u8 *p, s32 *last_x86_pos,
+		    s32 last_target_usages[], bool undo)
+{
+	s32 max_trans_offset;
+	s32 opcode_nbytes;
+	u16 target16;
+	s32 i;
+
+	max_trans_offset = LZMS_X86_MAX_TRANSLATION_OFFSET;
+
+	/*
+	 * p[0] has one of the following values:
+	 *	0x48 0x4C 0xE8 0xE9 0xF0 0xFF
+	 */
+
+	if (p[0] >= 0xF0) {
+		if (p[0] & 0x0F) {
+			/* 0xFF (instruction group) */
+			if (p[1] == 0x15) {
+				/* Call indirect relative */
+				opcode_nbytes = 2;
+				goto have_opcode;
+			}
+		} else {
+			/* 0xF0 (lock prefix) */
+			if (p[1] == 0x83 && p[2] == 0x05) {
+				/* Lock add relative */
+				opcode_nbytes = 3;
+				goto have_opcode;
+			}
+		}
+	} else if (p[0] <= 0x4C) {
+
+		/* 0x48 or 0x4C.  In 64-bit code this is a REX prefix byte with
+		 * W=1, R=[01], X=0, and B=0, and it will be followed by the
+		 * actual opcode, then additional bytes depending on the opcode.
+		 * We are most interested in several common instructions that
+		 * access data relative to the instruction pointer.  These use a
+		 * 1-byte opcode, followed by a ModR/M byte, followed by a
+		 * 4-byte displacement. */
+
+		/* Test: does the ModR/M byte indicate RIP-relative addressing?
+		 * Note: there seems to be a mistake in the format here; the
+		 * mask really should be 0xC7 instead of 0x07 so that both the
+		 * MOD and R/M fields of ModR/M are tested, not just R/M. */
+		if ((p[2] & 0x07) == 0x05) {
+			/* Check for the LEA (load effective address) or MOV
+			 * (move) opcodes.  For MOV there are additional
+			 * restrictions, although it seems they are only helpful
+			 * due to the overly lax ModR/M test. */
+			if (p[1] == 0x8D ||
+			    (p[1] == 0x8B && !(p[0] & 0x04) && !(p[2] & 0xF0)))
+			{
+				opcode_nbytes = 3;
+				goto have_opcode;
+			}
+		}
+	} else {
+		if (p[0] & 0x01) {
+			/* 0xE9: Jump relative.  Theoretically this would be
+			 * useful to translate, but in fact it's explicitly
+			 * excluded.  Most likely it creates too many false
+			 * positives for the detection algorithm. */
+			p += 4;
+		} else {
+			/* 0xE8: Call relative.  This is a common case, so it
+			 * uses a reduced max_trans_offset.  In other words, we
+			 * have to be more confident that the data actually is
+			 * x86 machine code before we'll do the translation. */
+			opcode_nbytes = 1;
+			max_trans_offset >>= 1;
+			goto have_opcode;
+		}
+	}
+
+	return p + 1;
+
+have_opcode:
+	i = p - data;
+	p += opcode_nbytes;
+	if (undo) {
+		if (i - *last_x86_pos <= max_trans_offset) {
+			u32 n = get_unaligned_le32(p);
+			put_unaligned_le32(n - i, p);
+		}
+		target16 = i + get_unaligned_le16(p);
+	} else {
+		target16 = i + get_unaligned_le16(p);
+		if (i - *last_x86_pos <= max_trans_offset) {
+			u32 n = get_unaligned_le32(p);
+			put_unaligned_le32(n + i, p);
+		}
+	}
+
+	i += opcode_nbytes + sizeof(le32) - 1;
+
+	if (i - last_target_usages[target16] <= LZMS_X86_ID_WINDOW_SIZE)
+		*last_x86_pos = i;
+
+	last_target_usages[target16] = i;
+
+	return p + sizeof(le32);
 }
 
 /*
@@ -403,15 +579,14 @@ lzms_x86_filter(u8 data[restrict], s32 size,
 	 * low-order 2 bytes of each address are considered significant.  */
 
-	s32 i;
-	s32 tail_idx;
-	u8 saved_byte;
-	s32 last_x86_pos;
+	u8 *p;
+	u8 *tail_ptr;
+	s32 last_x86_pos = -LZMS_X86_MAX_TRANSLATION_OFFSET - 1;
 
 	if (size <= 17)
 		return;
 
-	for (i = 0; i < 65536; i++)
+	for (s32 i = 0; i < 65536; i++)
 		last_target_usages[i] = -(s32)LZMS_X86_ID_WINDOW_SIZE - 1;
 
 	/*
@@ -429,133 +604,36 @@ lzms_x86_filter(u8 data[restrict], s32 size,
 	 *	data[(size - 16) + 7] and have no effect on the result, as long
 	 *	as we restore those bytes later.
 	 */
-	tail_idx = size - 16;
-	saved_byte = data[tail_idx + 8];
-	data[tail_idx + 8] = 0xE8;
-	last_x86_pos = -LZMS_X86_MAX_TRANSLATION_OFFSET - 1;
 
 	/* Note: the very first byte must be ignored completely! */
-	i = 0;
-	for (;;) {
-		s32 max_trans_offset;
-		s32 opcode_nbytes;
-		u16 target16;
-
-		/*
-		 * The following table is used to accelerate the common case
-		 * where the byte has nothing to do with x86 translation and
-		 * must simply be skipped.  This is the fastest (at least on
-		 * x86_64) of the implementations I tested.  The other
-		 * implementations I tested were:
-		 *   - Jump table with 256 entries
-		 *   - Switch statement with default
-		 */
-		static const u8 is_potential_opcode[256] = {
-			[0x48] = 1, [0x4C] = 1, [0xE8] = 1,
-			[0xE9] = 1, [0xF0] = 1, [0xFF] = 1,
-		};
+	p = data + 1;
+	tail_ptr = &data[size - 16];
 
+#ifdef __x86_64__
+	if (x86_have_cpu_feature(X86_CPU_FEATURE_SSE4_2)) {
+		u8 saved_byte = *tail_ptr;
+		*tail_ptr = 0xE8;
 		for (;;) {
-			if (is_potential_opcode[data[++i]])
+			u8 *new_p = find_next_opcode_sse4_2(p);
+			if (new_p >= tail_ptr - 8)
 				break;
-			if (is_potential_opcode[data[++i]])
-				break;
-			if (is_potential_opcode[data[++i]])
-				break;
-			if (is_potential_opcode[data[++i]])
-				break;
-		}
-
-		if (i >= tail_idx)
-			break;
-
-		max_trans_offset = LZMS_X86_MAX_TRANSLATION_OFFSET;
-		switch (data[i]) {
-		case 0x48:
-			if (data[i + 1] == 0x8B) {
-				if (data[i + 2] == 0x5 || data[i + 2] == 0xD) {
-					/* Load relative (x86_64) */
-					opcode_nbytes = 3;
-					goto have_opcode;
-				}
-			} else if (data[i + 1] == 0x8D) {
-				if ((data[i + 2] & 0x7) == 0x5) {
-					/* Load effective address relative (x86_64) */
-					opcode_nbytes = 3;
-					goto have_opcode;
-				}
-			}
-			break;
-		case 0x4C:
-			if (data[i + 1] == 0x8D) {
-				if ((data[i + 2] & 0x7) == 0x5) {
-					/* Load effective address relative (x86_64) */
-					opcode_nbytes = 3;
-					goto have_opcode;
-				}
-			}
-			break;
-		case 0xE8:
-			/* Call relative.  Note: 'max_trans_offset' must be
-			 * halved for this instruction.  This means that we must
-			 * be more confident that we are in a region of x86
-			 * machine code before we will do a translation for this
-			 * particular instruction.  */
-			opcode_nbytes = 1;
-			max_trans_offset /= 2;
-			goto have_opcode;
-		case 0xE9:
-			/* Jump relative */
-			i += 4;
-			break;
-		case 0xF0:
-			if (data[i + 1] == 0x83 && data[i + 2] == 0x05) {
-				/* Lock add relative */
-				opcode_nbytes = 3;
-				goto have_opcode;
-			}
-			break;
-		case 0xFF:
-			if (data[i + 1] == 0x15) {
-				/* Call indirect */
-				opcode_nbytes = 2;
-				goto have_opcode;
-			}
-			break;
+			p = new_p;
+			p = translate_if_needed(data, p, &last_x86_pos,
+						last_target_usages, undo);
 		}
-
-		continue;
-
-	have_opcode:
-		if (undo) {
-			if (i - last_x86_pos <= max_trans_offset) {
-				LZMS_DEBUG("Undid x86 translation at position %d "
-					   "(opcode 0x%02x)", i, data[i]);
-				void *p32 = &data[i + opcode_nbytes];
-				u32 n = get_unaligned_u32_le(p32);
-				put_unaligned_u32_le(n - i, p32);
-			}
-			target16 = i + get_unaligned_u16_le(&data[i + opcode_nbytes]);
-		} else {
-			target16 = i + get_unaligned_u16_le(&data[i + opcode_nbytes]);
-			if (i - last_x86_pos <= max_trans_offset) {
-				LZMS_DEBUG("Did x86 translation at position %d "
-					   "(opcode 0x%02x)", i, data[i]);
-				void *p32 = &data[i + opcode_nbytes];
-				u32 n = get_unaligned_u32_le(p32);
-				put_unaligned_u32_le(n + i, p32);
-			}
+		*tail_ptr = saved_byte;
+	}
+#endif
+	{
+		u8 saved_byte = *(tail_ptr + 8);
+		*(tail_ptr + 8) = 0xE8;
+		for (;;) {
+			p = find_next_opcode_default(p);
+			if (p >= tail_ptr)
+				break;
+			p = translate_if_needed(data, p, &last_x86_pos,
+						last_target_usages, undo);
 		}
-
-		i += opcode_nbytes + sizeof(le32) - 1;
-
-		if (i - last_target_usages[target16] <= LZMS_X86_ID_WINDOW_SIZE)
-			last_x86_pos = i;
-
-		last_target_usages[target16] = i;
-
-		continue;
+		*(tail_ptr + 8) = saved_byte;
 	}
-
-	data[tail_idx + 8] = saved_byte;
 }
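
Editor's note (not part of the patch): the inline asm in
find_next_opcode_sse4_2() runs PCMPESTRI with EAX = 6 (the needle holds six
opcode bytes), EDX = 16 (each haystack chunk is 16 bytes), and immediate 0x0,
which selects unsigned-byte, "equal any", least-significant-index mode.  The
sketch below expresses the same scan with compiler intrinsics; it is a
hypothetical equivalent for illustration only (the name
find_next_opcode_sse4_2_intrinsics is not from the patch), and like the asm it
assumes the caller reserved a 16-byte tail and planted a 0xE8 sentinel so the
loop always terminates before running off the buffer.

#include <nmmintrin.h>	/* SSE4.2 intrinsics; compile with -msse4.2 */

static unsigned char *
find_next_opcode_sse4_2_intrinsics(unsigned char *p)
{
	/* The same six "potential opcode" bytes the patch searches for. */
	const __m128i needle = _mm_setr_epi8(
		0x48, 0x4C, (char)0xE8, (char)0xE9, (char)0xF0, (char)0xFF,
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0);

	for (;;) {
		__m128i hay = _mm_loadu_si128((const __m128i *)p);

		/* Index of the first byte of 'hay' equal to any of the first
		 * 6 bytes of 'needle', or 16 if there is no match.  The mode
		 * flags below are all zero, matching the asm's $0x0. */
		int idx = _mm_cmpestri(needle, 6, hay, 16,
				       _SIDD_UBYTE_OPS |
				       _SIDD_CMP_EQUAL_ANY |
				       _SIDD_LEAST_SIGNIFICANT);
		if (idx < 16)
			return p + idx;	/* found a potential opcode */
		p += 16;		/* whole chunk is uninteresting */
	}
}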
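
Because decompression must invert compression exactly, the undo pass of
lzms_x86_filter() has to reverse the forward pass byte-for-byte (the forward
pass rewrites each matched rel32 field from n to n + i, the undo pass back to
n - i).  Below is a minimal round-trip sketch of that property, assuming an
in-tree test program where wimlib's internal headers (and thus u8, s32, and
lzms_x86_filter()) are visible; check_x86_filter_roundtrip() is a hypothetical
helper name, not part of the patch.

#include <assert.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

#include "wimlib/lzms_common.h"	/* u8, s32, lzms_x86_filter() */

static void
check_x86_filter_roundtrip(const u8 *orig, s32 size)
{
	u8 *buf = malloc(size);
	/* One slot per possible 16-bit target address; per this patch the
	 * filter initializes the array itself, so no pre-init is needed. */
	s32 *last_target_usages = malloc(65536 * sizeof(s32));

	assert(buf && last_target_usages);
	memcpy(buf, orig, size);

	/* Forward pass: matched relative addresses become absolute. */
	lzms_x86_filter(buf, size, last_target_usages, false);

	/* Undo pass: the absolute addresses become relative again. */
	lzms_x86_filter(buf, size, last_target_usages, true);

	/* The undo pass must exactly invert the forward pass. */
	assert(memcmp(buf, orig, size) == 0);

	free(last_target_usages);
	free(buf);
}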