+lzms_init_symbol_frequencies(u32 freqs[], unsigned num_syms)
+{
+ for (unsigned sym = 0; sym < num_syms; sym++)
+ freqs[sym] = 1;
+}
+
+/*
+ * Dilute an array of symbol frequencies in place: every entry is shifted
+ * right by one bit and then incremented by one.
+ *
+ * freqs    - frequency array to update
+ * num_syms - number of leading entries of freqs to update
+ */
+void
+lzms_dilute_symbol_frequencies(u32 freqs[], unsigned num_syms)
+{
+	u32 *slot = freqs;
+	unsigned remaining = num_syms;
+
+	while (remaining != 0) {
+		*slot = (*slot >> 1) + 1;
+		slot++;
+		remaining--;
+	}
+}
+
+
+#ifdef __x86_64__
+/*
+ * SSE4.2 variant of the opcode scan: advance p to the first byte whose value
+ * is one of 0x48, 0x4C, 0xE8, 0xE9, 0xF0 or 0xFF and return its address.
+ *
+ * Each PCMPESTRI (imm8 = 0: unsigned bytes, "equal any" comparison, index of
+ * the least significant match) tests the 16 bytes at p against the byte set
+ * held in potential_opcodes.  The explicit lengths are supplied in EAX (6,
+ * the number of set bytes in use, so the zero-filled remainder of the vector
+ * is ignored) and EDX (16, the number of data bytes).  The instruction sets
+ * CF when at least one data byte matched and leaves the index of the first
+ * matching byte in ECX.
+ *
+ * There is no end-of-buffer check: the loop steps forward 16 bytes at a time
+ * until a match is found, and every step loads a full 16 bytes from p.
+ * NOTE(review): this presumably relies on the caller guaranteeing that a
+ * matching byte exists at or after p, with enough readable bytes behind it
+ * -- confirm against the caller.
+ */
+static inline u8 *
+find_next_opcode_sse4_2(u8 *p)
+{
+	const __v16qi potential_opcodes = (__v16qi) {0x48, 0x4C, 0xE8, 0xE9, 0xF0, 0xFF};
+	__asm__(
+		/* Probe the first 16 bytes; skip the loop if they contain a match. */
+		" pcmpestri $0x0, (%[p]), %[potential_opcodes] \n"
+		" jc 2f \n"
+		/* No match yet: move to the next 16 bytes and probe again. */
+		"1: \n"
+		" add $0x10, %[p] \n"
+		" pcmpestri $0x0, (%[p]), %[potential_opcodes] \n"
+		" jnc 1b \n"
+		/* Match found: add its index within the 16-byte window to p. */
+		"2: \n"
+		" add %%rcx, %[p] \n"
+		: [p] "+r" (p)
+		: [potential_opcodes] "x" (potential_opcodes), "a" (6), "d" (16)
+		/*
+		 * "memory" is required because the template reads the buffer
+		 * through the register holding p, and that access is not
+		 * described by any operand.  Without the clobber the compiler
+		 * may assume the asm does not depend on memory and move or
+		 * cache stores to the buffer across it.
+		 */
+		: "rcx", "cc", "memory"
+	);
+
+	return p;
+}
+#endif /* __x86_64__ */
+
+/*
+ * Portable opcode scan: return the address of the first byte at or after p
+ * whose value is 0x48, 0x4C, 0xE8, 0xE9, 0xF0 or 0xFF.
+ *
+ * The scan has no end-of-buffer check; it stops only at a matching byte.
+ */
+static inline u8 *
+find_next_opcode_default(u8 *p)
+{
+	/*
+	 * Lookup table indexed by byte value; nonzero marks the byte values of
+	 * interest.  It speeds up the common case, in which the byte has
+	 * nothing to do with x86 translation and only needs to be stepped
+	 * over.  This was faster than the following alternatives:
+	 *   - a jump table with 256 entries
+	 *   - a switch statement with a default case
+	 */
+	static const u8 opcode_lut[256] = {
+		[0x48] = 1,
+		[0x4C] = 1,
+		[0xE8] = 1,
+		[0xE9] = 1,
+		[0xF0] = 1,
+		[0xFF] = 1,
+	};
+
+	/* Probe four consecutive bytes per iteration, in order. */
+	for (;;) {
+		if (opcode_lut[p[0]])
+			return p;
+		if (opcode_lut[p[1]])
+			return p + 1;
+		if (opcode_lut[p[2]])
+			return p + 2;
+		if (opcode_lut[p[3]])
+			return p + 3;
+		p += 4;
+	}
+}
+
+static inline u8 *
+translate_if_needed(u8 *data, u8 *p, s32 *last_x86_pos,
+ s32 last_target_usages[], bool undo)