+static inline void
+copy_word_unaligned(const void *src, void *dst)
+{
+ store_word_unaligned(load_word_unaligned(src), dst);
+}
+
+static inline machine_word_t
+repeat_byte(u8 b)
+{
+ machine_word_t v;
+
+ STATIC_ASSERT(WORDSIZE == 4 || WORDSIZE == 8);
+
+ v = b;
+ v |= v << 8;
+ v |= v << 16;
+ v |= v << ((WORDSIZE == 8) ? 32 : 0);
+ return v;
+}
+
+/*
+ * Copy an LZ77 match at (dst - offset) to dst.
+ *
+ * The length and offset must be already validated --- that is, (dst - offset)
+ * can't underrun the output buffer, and (dst + length) can't overrun the output
+ * buffer. Also, the length cannot be 0.
+ *
+ * @winend points to the byte past the end of the output buffer.
+ * This function won't write any data beyond this position.
+ */
+static inline void
+lz_copy(u8 *dst, u32 length, u32 offset, const u8 *winend, u32 min_length)
+{
+ const u8 *src = dst - offset;
+ const u8 * const end = dst + length;
+
+ /*
+ * Try to copy one machine word at a time. On i386 and x86_64 this is
+ * faster than copying one byte at a time, unless the data is
+ * near-random and all the matches have very short lengths. Note that
+ * since this requires unaligned memory accesses, it won't necessarily
+ * be faster on every architecture.
+ *
+ * Also note that we might copy more than the length of the match. For
+ * example, if a word is 8 bytes and the match is of length 5, then
+ * we'll simply copy 8 bytes. This is okay as long as we don't write
+ * beyond the end of the output buffer, hence the check for (winend -
+ * end >= WORDSIZE - 1).
+ */
+ if (UNALIGNED_ACCESS_IS_VERY_FAST &&
+ likely(winend - end >= WORDSIZE - 1))
+ {
+
+ if (offset >= WORDSIZE) {
+ /* The source and destination words don't overlap. */
+
+ /* To improve branch prediction, one iteration of this
+ * loop is unrolled. Most matches are short and will
+ * fail the first check. But if that check passes, then
+ * it becomes increasing likely that the match is long
+ * and we'll need to continue copying. */
+
+ copy_word_unaligned(src, dst);
+ src += WORDSIZE;
+ dst += WORDSIZE;
+
+ if (dst < end) {
+ do {
+ copy_word_unaligned(src, dst);
+ src += WORDSIZE;
+ dst += WORDSIZE;
+ } while (dst < end);
+ }
+ return;
+ } else if (offset == 1) {
+
+ /* Offset 1 matches are equivalent to run-length
+ * encoding of the previous byte. This case is common
+ * if the data contains many repeated bytes. */
+
+ machine_word_t v = repeat_byte(*(dst - 1));
+ do {
+ store_word_unaligned(v, dst);
+ src += WORDSIZE;
+ dst += WORDSIZE;
+ } while (dst < end);
+ return;
+ }
+ /*
+ * We don't bother with special cases for other 'offset <
+ * WORDSIZE', which are usually rarer than 'offset == 1'. Extra
+ * checks will just slow things down. Actually, it's possible
+ * to handle all the 'offset < WORDSIZE' cases using the same
+ * code, but it still becomes more complicated doesn't seem any
+ * faster overall; it definitely slows down the more common
+ * 'offset == 1' case.
+ */
+ }
+
+ /* Fall back to a bytewise copy. */
+
+ if (min_length >= 2) {
+ *dst++ = *src++;
+ length--;
+ }
+ if (min_length >= 3) {
+ *dst++ = *src++;
+ length--;
+ }
+ if (min_length >= 4) {
+ *dst++ = *src++;
+ length--;
+ }
+ do {
+ *dst++ = *src++;
+ } while (--length);
+}