- * example, if an 'unsigned long' is 8 bytes and the match is of length
- * 5, then we'll simply copy 8 bytes. This is okay as long as we don't
- * write beyond the end of the output buffer, hence the check for
- * (winend - (dst + length) >= sizeof(unsigned long) - 1). */
- if (offset >= sizeof(unsigned long) &&
- winend - (dst + length) >= sizeof(unsigned long) - 1)
- {
- /* Access memory through a packed struct. This tricks the
- * compiler into allowing unaligned memory accesses. */
- struct ulong_wrapper {
- unsigned long v;
- } _packed_attribute;
-
- const u8 *end = dst + length;
- do {
- unsigned long v = ((struct ulong_wrapper *)src)->v;
- ((struct ulong_wrapper *)dst)->v = v;
- dst += sizeof(unsigned long);
- src += sizeof(unsigned long);
- } while (dst < end);
+ * example, if a word is 8 bytes and the match is of length 5, then
+ * we'll simply copy 8 bytes. This is okay as long as we don't write
+ * beyond the end of the output buffer, hence the check for (winend -
+ * end >= WORDBYTES - 1).
+ */
+ if (UNALIGNED_ACCESS_IS_FAST && likely(winend - end >= WORDBYTES - 1)) {
+
+ if (offset >= WORDBYTES) {
+ /* The source and destination words don't overlap. */
+
+ /* To improve branch prediction, one iteration of this
+ * loop is unrolled. Most matches are short and will
+ * fail the first check. But if that check passes, then
+			 * it becomes increasingly likely that the match is long
+ * and we'll need to continue copying. */
+
+ copy_word_unaligned(src, dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+
+ if (dst < end) {
+ do {
+ copy_word_unaligned(src, dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ } while (dst < end);
+ }
+ return;
+ } else if (offset == 1) {
+
+ /* Offset 1 matches are equivalent to run-length
+ * encoding of the previous byte. This case is common
+ * if the data contains many repeated bytes. */
+
+ machine_word_t v = repeat_byte(*(dst - 1));
+ do {
+ store_word_unaligned(v, dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ } while (dst < end);
+ return;
+ }
+ /*
+ * We don't bother with special cases for other 'offset <
+ * WORDBYTES', which are usually rarer than 'offset == 1'.
+ * Extra checks will just slow things down. Actually, it's
+ * possible to handle all the 'offset < WORDBYTES' cases using
+	 * the same code, but it still becomes more complicated and doesn't
+ * seem any faster overall; it definitely slows down the more
+ * common 'offset == 1' case.
+ */
+ }