+ /* To improve branch prediction, one iteration of this
+ * loop is unrolled. Most matches are short and will
+ * fail the first check. But if that check passes, then
+ * it becomes increasingly likely that the match is long
+ * and we'll need to continue copying. */
+
+ copy_word_unaligned(src, dst);
+ src += WORDSIZE;
+ dst += WORDSIZE;
+
+ if (dst < end) {
+ do {
+ copy_word_unaligned(src, dst);
+ src += WORDSIZE;
+ dst += WORDSIZE;
+ } while (dst < end);
+ }
+ return;
+ } else if (offset == 1) {
+
+ /* Offset 1 matches are equivalent to run-length
+ * encoding of the previous byte. This case is common
+ * if the data contains many repeated bytes. */
+
+ machine_word_t v = repeat_byte(*(dst - 1));
+ do {
+ store_word_unaligned(v, dst);
+ src += WORDSIZE;
+ dst += WORDSIZE;
+ } while (dst < end);
+ return;
+ }
+ /*
+ * We don't bother with special cases for other 'offset <
+ * WORDSIZE', which are usually rarer than 'offset == 1'. Extra
+ * checks will just slow things down. Actually, it's possible
+ * to handle all the 'offset < WORDSIZE' cases using the same
+ * code, but it would be more complicated and doesn't seem any
+ * faster overall; it definitely slows down the more common
+ * 'offset == 1' case.
+ */
+ }
+
+ /* Fall back to a bytewise copy. */
+
+ if (min_length >= 2) {
+ *dst++ = *src++;
+ length--;
+ }
+ if (min_length >= 3) {
+ *dst++ = *src++;
+ length--;
+ }
+ if (min_length >= 4) {
+ *dst++ = *src++;
+ length--;