-/* sha1.c - Functions to compute SHA1 message digest of files or
- memory blocks according to the NIST specification FIPS-180-1.
-
- Copyright (C) 2000-2001, 2003-2006, 2008-2011 Free Software Foundation, Inc.
-
- This program is free software; you can redistribute it and/or modify it
- under the terms of the GNU General Public License as published by the
- Free Software Foundation; either version 3, or (at your option) any
- later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software Foundation,
- Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
-
-/* Written by Scott G. Miller
- Credits:
- Robert Klep <robert@ilse.nl> -- Expansion function fix
-
- Modified by Eric Biggers for wimlib: Conditionally compile in the use of
- OpenSSL or Intel's assembly code for SHA1 block updates
-*/
-
-#include "util.h"
-#include "wimlib.h"
-#include "sha1.h"
-#include "endianness.h"
-#include <string.h>
-
-#define SWAP(n) to_be32(n)
-
-#define BLOCKSIZE 32768
-#if BLOCKSIZE % 64 != 0
-#error "invalid BLOCKSIZE"
+/*
+ * sha1.c - implementation of the Secure Hash Algorithm version 1 (FIPS 180-1)
+ *
+ * Copyright 2022-2023 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
#endif
+#include "wimlib/cpu_features.h"
+#include "wimlib/endianness.h"
+#include "wimlib/sha1.h"
+#include "wimlib/unaligned.h"
-#ifdef WITH_LIBCRYPTO
+/*----------------------------------------------------------------------------*
+ * Shared helpers *
+ *----------------------------------------------------------------------------*/
-static inline void sha1_init_ctx(SHA_CTX *ctx)
+static inline u32
+rol32(u32 v, int bits)
{
- SHA1_Init(ctx);
+ return (v << bits) | (v >> (32 - bits));
}
-static inline void sha1_process_block(const void *buffer, size_t len,
- SHA_CTX *ctx)
-{
- SHA1_Update(ctx, buffer, len);
-}
+/* Expands to the round constant for the given round */
+#define SHA1_K(i) \
+ (((i) < 20) ? 0x5A827999 : \
+ ((i) < 40) ? 0x6ED9EBA1 : \
+ ((i) < 60) ? 0x8F1BBCDC : \
+ 0xCA62C1D6)
+
+/* Expands to the computation on b, c, and d for the given round */
+#define SHA1_F(i, b, c, d) \
+ (((i) < 20) ? /* Choice */ (b & (c ^ d)) ^ d : \
+ ((i) < 40) ? /* Parity */ b ^ c ^ d : \
+ ((i) < 60) ? /* Majority */ (c & d) ^ (b & (c ^ d)) : \
+ /* Parity */ b ^ c ^ d)
+
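+/*
+ * Editor's note: these are reduced forms of the usual FIPS 180-1 functions:
+ * Choice (b & c) | (~b & d) == (b & (c ^ d)) ^ d, and Majority
+ * (b & c) | (b & d) | (c & d) == (c & d) ^ (b & (c ^ d)). The reduced forms
+ * need fewer operations and no NOT.
+ */
+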
+/*
+ * Expands to a memory barrier for the given array, preventing values of the
+ * array from being cached in registers past the barrier. Use this to prevent
+ * the compiler from making counter-productive optimizations when there aren't
+ * enough registers available to hold the full array.
+ */
+#define FORCE_NOT_CACHED(array) asm volatile("" : "+m" (array))
+
+/*
+ * Expands to FORCE_NOT_CACHED() if the architecture has 16 or fewer general
+ * purpose registers, otherwise does nothing.
+ */
+#if defined(__i386__) || defined(__x86_64__) || defined(__arm__)
+# define FORCE_NOT_CACHED_IF_FEW_REGS(array) FORCE_NOT_CACHED(array)
+#else
+# define FORCE_NOT_CACHED_IF_FEW_REGS(array) (void)(array)
+#endif
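+
+/*
+ * Editor's illustration (hypothetical snippet): in code such as
+ *
+ *	u32 w[16];
+ *	FORCE_NOT_CACHED(w);
+ *	w[i % 16] = ...;
+ *
+ * the "+m" constraint marks the empty asm as both reading and writing all of
+ * 'w' in memory, so the compiler must have 'w' up to date in memory at the
+ * barrier and reload it afterward, rather than shadowing the whole array in
+ * registers and spilling the working variables instead.
+ */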
-static inline void sha1_process_bytes(const void *buffer, size_t len,
- SHA_CTX *ctx)
+/*----------------------------------------------------------------------------*
+ * Generic implementation *
+ *----------------------------------------------------------------------------*/
+
+/*
+ * This is SHA-1 in portable C code. It computes the message schedule
+ * just-in-time, in a rolling window of length 16.
+ */
+
+#define SHA1_GENERIC_ROUND(i, a, b, c, d, e) \
+ FORCE_NOT_CACHED_IF_FEW_REGS(w); \
+ if ((i) < 16) \
+ w[i] = get_unaligned_be32(data + ((i) * 4)); \
+ else \
+ w[(i) % 16] = rol32(w[((i) - 16) % 16] ^ \
+ w[((i) - 14) % 16] ^ \
+ w[((i) - 8) % 16] ^ \
+ w[((i) - 3) % 16], 1); \
+ e += w[(i) % 16] + rol32(a, 5) + SHA1_F((i), b, c, d) + SHA1_K(i); \
+ b = rol32(b, 30);
+ /* implicit: the new (a, b, c, d, e) is the old (e, a, b, c, d) */
+
+#define SHA1_GENERIC_5ROUNDS(i) \
+ SHA1_GENERIC_ROUND((i) + 0, a, b, c, d, e); \
+ SHA1_GENERIC_ROUND((i) + 1, e, a, b, c, d); \
+ SHA1_GENERIC_ROUND((i) + 2, d, e, a, b, c); \
+ SHA1_GENERIC_ROUND((i) + 3, c, d, e, a, b); \
+ SHA1_GENERIC_ROUND((i) + 4, b, c, d, e, a);
+
+#define SHA1_GENERIC_20ROUNDS(i) \
+ SHA1_GENERIC_5ROUNDS((i) + 0); \
+ SHA1_GENERIC_5ROUNDS((i) + 5); \
+ SHA1_GENERIC_5ROUNDS((i) + 10); \
+ SHA1_GENERIC_5ROUNDS((i) + 15);
+
+static void
+sha1_blocks_generic(u32 h[5], const void *data, size_t num_blocks)
{
- SHA1_Update(ctx, buffer, len);
+ do {
+ u32 a = h[0];
+ u32 b = h[1];
+ u32 c = h[2];
+ u32 d = h[3];
+ u32 e = h[4];
+ u32 w[16];
+
+ SHA1_GENERIC_20ROUNDS(0);
+ SHA1_GENERIC_20ROUNDS(20);
+ SHA1_GENERIC_20ROUNDS(40);
+ SHA1_GENERIC_20ROUNDS(60);
+
+ h[0] += a;
+ h[1] += b;
+ h[2] += c;
+ h[3] += d;
+ h[4] += e;
+ data += SHA1_BLOCK_SIZE;
+ } while (--num_blocks);
}
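+
+/*
+ * Editor's sketch (not part of the file): the unrolled macros above are
+ * equivalent to this plain loop over the 80 rounds, using the same rolling
+ * 16-word window for the message schedule.
+ */
+#if 0
+static void
+sha1_block_generic_reference(u32 h[5], const u8 *data)
+{
+	u32 a = h[0], b = h[1], c = h[2], d = h[3], e = h[4];
+	u32 w[16];
+
+	for (int i = 0; i < 80; i++) {
+		u32 t;
+
+		if (i < 16)
+			w[i] = get_unaligned_be32(data + (i * 4));
+		else
+			w[i % 16] = rol32(w[(i - 16) % 16] ^
+					  w[(i - 14) % 16] ^
+					  w[(i - 8) % 16] ^
+					  w[(i - 3) % 16], 1);
+		t = e + w[i % 16] + rol32(a, 5) + SHA1_F(i, b, c, d) + SHA1_K(i);
+		e = d;
+		d = c;
+		c = rol32(b, 30);
+		b = a;
+		a = t;
+	}
+	h[0] += a;
+	h[1] += b;
+	h[2] += c;
+	h[3] += d;
+	h[4] += e;
+}
+#endif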
+/*----------------------------------------------------------------------------*
+ * x86 SSSE3 (and AVX+BMI2) implementation *
+ *----------------------------------------------------------------------------*/
+
+/*
+ * This is SHA-1 using the x86 SSSE3 instructions. A copy of it is also
+ * compiled with AVX and BMI2 code generation enabled for improved performance.
+ *
+ * Unfortunately this isn't actually much faster than the generic
+ * implementation, since only the message schedule can be vectorized, not the
+ * SHA itself. The vectorized computation of the message schedule is
+ * interleaved with the scalar computation of the SHA itself.
+ *
+ * Specifically, 16 rounds ahead of time, the words of the message schedule are
+ * calculated, the round constants are added to them, and they are stored in a
+ * temporary array that the scalar code reads from later. This is done 4 words
+ * at a time, but split into 4 steps, so that one step is executed during each
+ * round. Rounds 16-31 use the usual formula 'w[i] = rol32(w[i-16] ^ w[i-14] ^
+ * w[i-8] ^ w[i-3], 1)', while rounds 32-79 use the equivalent formula 'w[i] =
+ * rol32(w[i-32] ^ w[i-28] ^ w[i-16] ^ w[i-6], 2)' for improved vectorization:
+ * its nearest input is w[i-6], so all four inputs of a 4-word group come from
+ * earlier groups, whereas w[i-3] falls inside the very group being computed.
+ *
+ * During rounds 80-95, the first 16 message schedule words for the next block
+ * are prepared.
+ */
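+
+/*
+ * Editor's note on the pipelining: SHA1_SSSE3_PRECALC(i) below runs during
+ * scalar round (i) - 16; its final step stores tmp[((i) - 3) % 16], the 4
+ * ready-to-use words (round constants included) for rounds (i) - 3 ... (i),
+ * so each word is available at least 13 rounds before it is consumed.
+ */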
+#if defined(__i386__) || defined(__x86_64__)
+#include <immintrin.h>
+
+#define SHA1_SSSE3_PRECALC(i, w0, w1, w2, w3, w4, w5, w6, w7) \
+ if ((i) % 20 == 0) \
+ k = _mm_set1_epi32(SHA1_K((i) % 80)); \
+ if ((i) < 32) { \
+ /*
+ * Vectorized computation of w[i] = rol32(w[i-16] ^ w[i-14] ^
+ * w[i-8] ^ w[i-3], 1) for i...i+3, split into 4 steps.
+ * w[i-16..i+3] are in (w0, w1, w2, w3, w4).
+ */ \
+ if ((i) % 4 == 0) { \
+ w4 = _mm_alignr_epi8(w1, w0, 8) ^ w2; \
+ t0 = _mm_srli_si128(w3, 4); \
+ } else if ((i) % 4 == 1) { \
+ t0 ^= w4 ^ w0; \
+ t1 = _mm_slli_si128(t0, 12); \
+ } else if ((i) % 4 == 2) { \
+ t2 = _mm_slli_epi32(t1, 2); \
+ w4 = _mm_slli_epi32(t0, 1); \
+ t0 = _mm_srli_epi32(t0, 31); \
+ t2 ^= _mm_srli_epi32(t1, 30); \
+ } else { \
+ w4 ^= t0 ^ t2; \
+ t0 = _mm_add_epi32(w4, k); \
+ _mm_store_si128((__m128i *)&tmp[((i) - 3) % 16], t0); \
+ } \
+ } else if ((i) < 80) { \
+ /*
+ * Vectorized computation of w[i] = rol32(w[i-32] ^ w[i-28] ^
+ * w[i-16] ^ w[i-6], 2) for i...i+3, split into 4 steps.
+ * w[i-32..i+3] are in (w4, w5, w6, w7, w0, w1, w2, w3, w4);
+ * note the reuse of w4.
+ */ \
+ if ((i) % 4 == 0) \
+ w4 ^= _mm_alignr_epi8(w3, w2, 8); \
+ else if ((i) % 4 == 1) \
+ w4 ^= w5 ^ w0; \
+ else if ((i) % 4 == 2) \
+ w4 = _mm_slli_epi32(w4, 2) ^ \
+ _mm_srli_epi32(w4, 30); \
+ else \
+ _mm_store_si128((__m128i *)&tmp[((i) - 3) % 16],\
+ _mm_add_epi32(w4, k)); \
+ } else if ((i) < 96) { \
+ /* Precomputation of w[0..15] for next block */ \
+ if ((i) == 80 && --num_blocks != 0) \
+ data += SHA1_BLOCK_SIZE; \
+ if ((i) % 4 == 0) \
+ w0 = _mm_loadu_si128(data + (((i) - 80) * 4)); \
+ else if ((i) % 4 == 1) \
+ w0 = _mm_shuffle_epi8(w0, bswap32_mask); \
+ else if ((i) % 4 == 2) \
+ t0 = _mm_add_epi32(w0, k); \
+ else \
+ _mm_store_si128((__m128i *)&tmp[(i) - 83], t0); \
+ }
-static inline void *sha1_finish_ctx(SHA_CTX *ctx, void *resbuf)
+#define SHA1_SSSE3_2ROUNDS(i, a, b, c, d, e, w0, w1, w2, w3, w4, w5, w6, w7) \
+ FORCE_NOT_CACHED(tmp); \
+ e += tmp[(i) % 16] + rol32(a, 5) + SHA1_F((i), b, c, d); \
+ b = rol32(b, 30); \
+ SHA1_SSSE3_PRECALC((i) + 16, w0, w1, w2, w3, w4, w5, w6, w7); \
+ FORCE_NOT_CACHED(tmp); \
+ d += tmp[((i) + 1) % 16] + rol32(e, 5) + SHA1_F((i) + 1, a, b, c); \
+ SHA1_SSSE3_PRECALC((i) + 17, w0, w1, w2, w3, w4, w5, w6, w7); \
+ a = rol32(a, 30);
+ /* implicit: the new (a, b, c, d, e) is the old (d, e, a, b, c) */
+
+#define SHA1_SSSE3_4ROUNDS(i, a, b, c, d, e, w0, w1, w2, w3, w4, w5, w6, w7) \
+ SHA1_SSSE3_2ROUNDS((i) + 0, a, b, c, d, e, w0, w1, w2, w3, w4, w5, w6, w7); \
+	SHA1_SSSE3_2ROUNDS((i) + 2, d, e, a, b, c, w0, w1, w2, w3, w4, w5, w6, w7);
+ /*
+ * implicit: the new (w0-w7) is the old (w1-w7,w0),
+ * and the new (a, b, c, d, e) is the old (b, c, d, e, a)
+ */
+
+#define SHA1_SSSE3_20ROUNDS(i, w0, w1, w2, w3, w4, w5, w6, w7) \
+ SHA1_SSSE3_4ROUNDS((i) + 0, a, b, c, d, e, w0, w1, w2, w3, w4, w5, w6, w7); \
+ SHA1_SSSE3_4ROUNDS((i) + 4, b, c, d, e, a, w1, w2, w3, w4, w5, w6, w7, w0); \
+ SHA1_SSSE3_4ROUNDS((i) + 8, c, d, e, a, b, w2, w3, w4, w5, w6, w7, w0, w1); \
+ SHA1_SSSE3_4ROUNDS((i) + 12, d, e, a, b, c, w3, w4, w5, w6, w7, w0, w1, w2); \
+ SHA1_SSSE3_4ROUNDS((i) + 16, e, a, b, c, d, w4, w5, w6, w7, w0, w1, w2, w3);
+ /* implicit: the new (w0-w7) is the old (w5-w7,w0-w4) */
+
+#define SHA1_SSSE3_BODY \
+ const __m128i bswap32_mask = \
+ _mm_setr_epi8( 3, 2, 1, 0, 7, 6, 5, 4, \
+ 11, 10, 9, 8, 15, 14, 13, 12); \
+ __m128i w0, w1, w2, w3, w4, w5, w6, w7; \
+ __m128i k = _mm_set1_epi32(SHA1_K(0)); \
+ u32 tmp[16] __attribute__((aligned(16))); \
+ \
+ w0 = _mm_shuffle_epi8(_mm_loadu_si128(data + 0), bswap32_mask); \
+ w1 = _mm_shuffle_epi8(_mm_loadu_si128(data + 16), bswap32_mask); \
+ w2 = _mm_shuffle_epi8(_mm_loadu_si128(data + 32), bswap32_mask); \
+ w3 = _mm_shuffle_epi8(_mm_loadu_si128(data + 48), bswap32_mask); \
+ _mm_store_si128((__m128i *)&tmp[0], _mm_add_epi32(w0, k)); \
+ _mm_store_si128((__m128i *)&tmp[4], _mm_add_epi32(w1, k)); \
+ _mm_store_si128((__m128i *)&tmp[8], _mm_add_epi32(w2, k)); \
+ _mm_store_si128((__m128i *)&tmp[12], _mm_add_epi32(w3, k)); \
+ \
+ do { \
+ u32 a = h[0]; \
+ u32 b = h[1]; \
+ u32 c = h[2]; \
+ u32 d = h[3]; \
+ u32 e = h[4]; \
+ __m128i t0, t1, t2; \
+ \
+ SHA1_SSSE3_20ROUNDS(0, w0, w1, w2, w3, w4, w5, w6, w7); \
+ SHA1_SSSE3_20ROUNDS(20, w5, w6, w7, w0, w1, w2, w3, w4); \
+ SHA1_SSSE3_20ROUNDS(40, w2, w3, w4, w5, w6, w7, w0, w1); \
+ SHA1_SSSE3_20ROUNDS(60, w7, w0, w1, w2, w3, w4, w5, w6); \
+ \
+ h[0] += a; \
+ h[1] += b; \
+ h[2] += c; \
+ h[3] += d; \
+ h[4] += e; \
+ \
+ /* 'data' and 'num_blocks' were updated at start of round 64. */ \
+ } while (num_blocks);
+
+#define HAVE_SHA1_BLOCKS_X86_SSSE3
+static void __attribute__((target("ssse3")))
+sha1_blocks_x86_ssse3(u32 h[5], const void *data, size_t num_blocks)
{
- SHA1_Final(resbuf, ctx);
+ SHA1_SSSE3_BODY;
}
-#else /* WITH_LIBCRYPTO */
-
-/* Structure to save state of computation between the single steps. */
-struct sha1_ctx {
- uint32_t A;
- uint32_t B;
- uint32_t C;
- uint32_t D;
- uint32_t E;
- uint32_t total[2];
- uint32_t buflen;
- uint32_t buffer[32];
-};
-
-typedef struct sha1_ctx SHA_CTX;
-
-#ifdef ENABLE_SSSE3_SHA1
-extern void sha1_update_intel(int *hash, const char* input, size_t num_blocks);
-
-static inline void sha1_process_block(const void *buffer, size_t len,
- SHA_CTX *ctx)
+#define HAVE_SHA1_BLOCKS_X86_AVX_BMI2
+static void __attribute__((target("avx,bmi2")))
+sha1_blocks_x86_avx_bmi2(u32 h[5], const void *data, size_t num_blocks)
{
- sha1_update_intel((int*)ctx, buffer, len / 64);
- ctx->total[0] += len;
- if (ctx->total[0] < len)
- ++ctx->total[1];
+ SHA1_SSSE3_BODY;
}
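+
+/*
+ * Editor's note: the AVX+BMI2 copy is presumably faster mainly due to the
+ * non-destructive three-operand VEX encodings of the vector instructions and
+ * to BMI2's RORX covering the rol32() calls, which saves register moves.
+ */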
-
-#include <stdlib.h>
-void ssse3_not_found()
+#endif /* x86 SSSE3 (and AVX+BMI2) implementation */
+
+/*----------------------------------------------------------------------------*
+ * x86 SHA Extensions implementation *
+ *----------------------------------------------------------------------------*/
+
+/*
+ * This is SHA-1 using the x86 SHA extensions.
+ *
+ * The SHA1RNDS4 instruction does most of the work. It takes in a 128-bit
+ * vector containing 'a', 'b', 'c', and 'd' (high-order to low-order), a 128-bit
+ * vector containing the next 4 words of the message schedule with 'e' added to
+ * the high-order word, and an immediate that identifies the current 20-round
+ * section. It does 4 rounds and updates 'a', 'b', 'c', and 'd' accordingly.
+ *
+ * Each SHA1RNDS4 is paired with SHA1NEXTE. It takes in the abcd vector from
+ * 4 rounds earlier, calculates the value 'e' has after those 4 rounds (the
+ * rotation of that vector's 'a'), and adds it to the high-order word of a
+ * vector that contains the next 4 words of the message schedule.
+ *
+ * Each set of 4 message schedule words for rounds 16-79 is calculated as
+ * rol32(w[i-16] ^ w[i-14] ^ w[i-8] ^ w[i-3], 1) in three steps using the
+ * SHA1MSG1, PXOR, and SHA1MSG2 instructions. This happens in a rolling window,
+ * so during the j'th set of 4 rounds we do the SHA1MSG2 step for the j+1'th
+ * set of message schedule words, PXOR for the j+2'th set, and SHA1MSG1 for the
+ * j+3'th set.
+ */
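+
+/*
+ * Editor's illustration (names taken from the macros below): with the
+ * message-schedule work omitted, each 4-round group is essentially
+ *
+ *	we0 = _mm_sha1nexte_epu32(we0, w0);	// 'e' + next 4 schedule words
+ *	we1 = abcd;				// its 'a' yields 'e' in 4 rounds
+ *	abcd = _mm_sha1rnds4_epu32(abcd, we0, (i) / 20);
+ *
+ * where the immediate (i) / 20 selects the round function and constant.
+ */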
+#if defined(__i386__) || defined(__x86_64__)
+#include <immintrin.h>
+
+#define SHA1_NI_4ROUNDS(i, w0, w1, w2, w3, we0, we1) \
+ if ((i) < 16) \
+ w0 = _mm_shuffle_epi8( \
+ _mm_loadu_si128(data + ((i) * 4)), bswap_mask); \
+ if ((i) == 0) \
+ we0 = _mm_add_epi32(h_e, w0); \
+ else \
+ we0 = _mm_sha1nexte_epu32(/* old abcd */ we0, w0); \
+ we1 = abcd; \
+ if ((i) >= 12 && (i) < 76) \
+ w1 = _mm_sha1msg2_epu32(w1, w0); \
+ abcd = _mm_sha1rnds4_epu32(abcd, we0, (i) / 20); \
+ if ((i) >= 8 && (i) < 72) \
+ w2 ^= w0; \
+ if ((i) >= 4 && (i) < 68) \
+		w3 = _mm_sha1msg1_epu32(w3, w0);
+ /*
+ * implicit: the new (w0, w1, w2, w3) is the old (w1, w2, w3, w0),
+ * and the new (we0, we1) is the old (we1, we0)
+ */
+
+#define SHA1_NI_16ROUNDS(i) \
+ SHA1_NI_4ROUNDS((i) + 0, w0, w1, w2, w3, we0, we1); \
+ SHA1_NI_4ROUNDS((i) + 4, w1, w2, w3, w0, we1, we0); \
+ SHA1_NI_4ROUNDS((i) + 8, w2, w3, w0, w1, we0, we1); \
+ SHA1_NI_4ROUNDS((i) + 12, w3, w0, w1, w2, we1, we0);
+
+#define HAVE_SHA1_BLOCKS_X86_SHA
+static void __attribute__((target("sha,sse4.1")))
+sha1_blocks_x86_sha(u32 h[5], const void *data, size_t num_blocks)
{
- fprintf(stderr,
-"Cannot calculate SHA1 message digest: CPU does not support SSSE3\n"
-"instructions! Recompile wimlib without the --enable-ssse3-sha1 flag\n"
-"to use wimlib on this CPU.\n");
- abort();
+ const __m128i bswap_mask =
+ _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8,
+ 7, 6, 5, 4, 3, 2, 1, 0);
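+	/*
+	 * Editor's note: 0x1B reverses the four 32-bit lanes, putting
+	 * 'a' = h[0] in the high lane where SHA1RNDS4 expects it; the
+	 * final store below applies the same shuffle to undo this.
+	 */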
+ __m128i h_abcd = _mm_shuffle_epi32(
+ _mm_loadu_si128((__m128i *)h), 0x1B);
+ __m128i h_e = _mm_setr_epi32(0, 0, 0, h[4]);
+
+ do {
+ __m128i abcd = h_abcd;
+ __m128i w0, w1, w2, w3, we0, we1;
+
+ SHA1_NI_16ROUNDS(0);
+ SHA1_NI_16ROUNDS(16);
+ SHA1_NI_16ROUNDS(32);
+ SHA1_NI_16ROUNDS(48);
+ SHA1_NI_16ROUNDS(64);
+
+ h_abcd = _mm_add_epi32(h_abcd, abcd);
+ h_e = _mm_sha1nexte_epu32(we0, h_e);
+ data += SHA1_BLOCK_SIZE;
+ } while (--num_blocks);
+
+ _mm_storeu_si128((__m128i *)h, _mm_shuffle_epi32(h_abcd, 0x1B));
+ h[4] = _mm_extract_epi32(h_e, 3);
}
-#else /* ENABLE_SSSE3_SHA1 */
-
-static void sha1_process_block(const void *buffer, size_t len,
- SHA_CTX *ctx);
-
-#endif /* ENABLE_SSSE3_SHA1 */
-
-
-/* This array contains the bytes used to pad the buffer to the next
- 64-byte boundary. (RFC 1321, 3.1: Step 1) */
-static const u8 fillbuf[64] = { 0x80, 0 /* , 0, 0, ... */ };
-
-/* Initialize structure containing state of computation. */
-static void sha1_init_ctx(SHA_CTX *ctx);
-
-/* Starting with the result of former calls of this function (or the
- initialization function update the context for the next LEN bytes
- starting at BUFFER.
- It is NOT required that LEN is a multiple of 64. */
-static void sha1_process_bytes(const void *buffer, size_t len,
- SHA_CTX *ctx);
-
-/* Process the remaining bytes in the buffer and put result from CTX
- in first 20 bytes following RESBUF. The result is always in little
- endian byte order, so that a byte-wise output yields to the wanted
- ASCII representation of the message digest. */
-static void *sha1_finish_ctx(SHA_CTX *ctx, void *resbuf);
-
-/* Put result from CTX in first 20 bytes following RESBUF. The result is
- always in little endian byte order, so that a byte-wise output yields
- to the wanted ASCII representation of the message digest. */
-static void *sha1_read_ctx(const SHA_CTX *ctx, void *resbuf);
-
-#endif /* WITH_LIBCRYPTO */
-
-
+#endif /* x86 SHA Extensions implementation */
+
+/*----------------------------------------------------------------------------*
+ * ARMv8 Crypto Extensions implementation *
+ *----------------------------------------------------------------------------*/
+
+/*
+ * This is SHA-1 using the ARMv8 Crypto Extensions.
+ *
+ * This does 4 rounds at a time, and it works very similarly to the x86 SHA
+ * Extensions implementation. The differences are fairly minor:
+ *
+ * - x86 has SHA1RNDS4 that takes an immediate that identifies the set of 20
+ * rounds, and it handles adding the round constants. ARM has SHA1C for
+ * rounds 0-19, SHA1P for rounds 20-39 and 60-79, and SHA1M for rounds 40-59.
+ * These don't add the round constants, so that must be done separately.
+ *
+ * - ARM needs only two instructions, instead of x86's three, to prepare each
+ * set of 4 message schedule words: SHA1SU0 which does w[i-16] ^ w[i-14] ^
+ * w[i-8], and SHA1SU1 which XOR's in w[i-3] and rotates left by 1.
+ */
+#if defined(__aarch64__) && \
+ (defined(__clang__) || (defined(__GNUC__) && __GNUC__ >= 5))
+
+/*
+ * clang's arm_neon.h used to have a bug where it only defined the SHA-1
+ * intrinsics when CRYPTO (clang 12 and earlier) or SHA2 (clang 13 and 14) is
+ * enabled in the main target. This prevents them from being used in target
+ * attribute functions. Work around this by defining the macros ourselves.
+ */
+#if defined(__clang__) && __clang_major__ <= 15
+# ifndef __ARM_FEATURE_CRYPTO
+# define __ARM_FEATURE_CRYPTO 1
+# define DEFINED_ARM_FEATURE_CRYPTO
+# endif
+# ifndef __ARM_FEATURE_SHA2
+# define __ARM_FEATURE_SHA2 1
+# define DEFINED_ARM_FEATURE_SHA2
+# endif
+#endif
+#include <arm_neon.h>
+#ifdef DEFINED_ARM_FEATURE_CRYPTO
+# undef __ARM_FEATURE_CRYPTO
+#endif
+#ifdef DEFINED_ARM_FEATURE_SHA2
+# undef __ARM_FEATURE_SHA2
+#endif
-/* Compute SHA1 message digest for bytes read from STREAM. The resulting
- * message digest number will be written into the 20 bytes beginning at
- * RESBLOCK. */
-int sha1_stream(FILE * stream, void *resblock)
+/* Expands to a vector containing 4 copies of the given round's constant */
+#define SHA1_CE_K(i) \
+ ((i) < 20 ? k0 : \
+ (i) < 40 ? k1 : \
+ (i) < 60 ? k2 : \
+ k3)
+
+/* Expands to the appropriate instruction for the given round */
+#define SHA1_CE_OP(i, abcd, e, w) \
+ ((i) < 20 ? vsha1cq_u32((abcd), (e), (w)) : \
+ (i) < 40 ? vsha1pq_u32((abcd), (e), (w)) : \
+ (i) < 60 ? vsha1mq_u32((abcd), (e), (w)) : \
+ vsha1pq_u32((abcd), (e), (w)))
+
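+/*
+ * Editor's note: vsha1h_u32() is SHA1H, i.e. rol32(x, 30). Saving it from
+ * lane 0 of 'abcd' (the current 'a') before the update gives exactly the 'e'
+ * value in effect 4 rounds later, which is why e0 and e1 alternate below.
+ */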
+#define SHA1_CE_4ROUNDS(i, w0, w1, w2, w3, e0, e1) \
+ tmp = w0 + SHA1_CE_K(i); \
+ e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0)); \
+ abcd = SHA1_CE_OP((i), abcd, e0, tmp); \
+ if ((i) >= 12 && (i) < 76) \
+ w1 = vsha1su1q_u32(w1, w0); \
+ if ((i) >= 8 && (i) < 72) \
+ w2 = vsha1su0q_u32(w2, w3, w0);
+ /*
+ * implicit: the new (w0, w1, w2, w3) is the old (w1, w2, w3, w0),
+ * and the new (e0, e1) is the old (e1, e0)
+ */
+
+#define SHA1_CE_16ROUNDS(i) \
+ SHA1_CE_4ROUNDS((i) + 0, w0, w1, w2, w3, e0, e1); \
+ SHA1_CE_4ROUNDS((i) + 4, w1, w2, w3, w0, e1, e0); \
+ SHA1_CE_4ROUNDS((i) + 8, w2, w3, w0, w1, e0, e1); \
+ SHA1_CE_4ROUNDS((i) + 12, w3, w0, w1, w2, e1, e0);
+
+#define HAVE_SHA1_BLOCKS_ARM_CE
+static void
+#ifdef __clang__
+ /*
+ * clang has the SHA-1 instructions under "sha2". "crypto" used to work
+ * too, but only in clang 15 and earlier. So, use "sha2" here.
+ */
+ __attribute__((target("sha2")))
+#else
+ /* gcc wants "+crypto". "+sha2" doesn't work. */
+ __attribute__((target("+crypto")))
+#endif
+sha1_blocks_arm_ce(u32 h[5], const void *data, size_t num_blocks)
{
- SHA_CTX ctx;
-
- size_t sum;
-
- char *buffer = MALLOC(BLOCKSIZE + 72);
- if (!buffer) {
- ERROR("Out of memory!\n");
- return WIMLIB_ERR_NOMEM;
- }
-
- /* Initialize the computation context. */
- sha1_init_ctx(&ctx);
-
- /* Iterate over full file contents. */
- while (1) {
- /* We read the file in blocks of BLOCKSIZE bytes. One call of the
- computation function processes the whole buffer so that with the
- next round of the loop another block can be read. */
- size_t n;
- sum = 0;
-
- /* Read block. Take care for partial reads. */
- while (1) {
- n = fread(buffer + sum, 1, BLOCKSIZE - sum, stream);
-
- sum += n;
-
- if (sum == BLOCKSIZE)
- break;
-
- if (n == 0) {
- /* Check for the error flag IFF N == 0, so that
- * we don't exit the loop after a partial read
- * due to e.g., EAGAIN or EWOULDBLOCK. */
- if (ferror(stream)) {
- FREE(buffer);
- ERROR("Read error while calculating "
- "SHA1 message digest: %m\n");
- return WIMLIB_ERR_READ;
- }
- goto process_partial_block;
- }
-
- /* We've read at least one byte, so ignore errors. But always
- check for EOF, since feof may be true even though N > 0.
- Otherwise, we could end up calling fread after EOF. */
- if (feof(stream))
- goto process_partial_block;
- }
-
- /* Process buffer with BLOCKSIZE bytes. Note that
- BLOCKSIZE % 64 == 0
- */
- sha1_process_block(buffer, BLOCKSIZE, &ctx);
- }
-
- process_partial_block:;
-
- /* Process any remaining bytes. */
- if (sum > 0)
- sha1_process_bytes(buffer, sum, &ctx);
-
- /* Construct result in desired memory. */
- sha1_finish_ctx(&ctx, resblock);
- FREE(buffer);
- return 0;
+ uint32x4_t h_abcd = vld1q_u32(h);
+ uint32x4_t k0 = vdupq_n_u32(SHA1_K(0));
+ uint32x4_t k1 = vdupq_n_u32(SHA1_K(20));
+ uint32x4_t k2 = vdupq_n_u32(SHA1_K(40));
+ uint32x4_t k3 = vdupq_n_u32(SHA1_K(60));
+
+ do {
+ uint32x4_t abcd = h_abcd;
+ u32 e0 = h[4], e1;
+ uint32x4_t tmp, w0, w1, w2, w3;
+
+ w0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(data + 0)));
+ w1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(data + 16)));
+ w2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(data + 32)));
+ w3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(data + 48)));
+
+ SHA1_CE_16ROUNDS(0);
+ SHA1_CE_16ROUNDS(16);
+ SHA1_CE_16ROUNDS(32);
+ SHA1_CE_16ROUNDS(48);
+ SHA1_CE_16ROUNDS(64);
+
+ h_abcd += abcd;
+ h[4] += e0;
+ data += SHA1_BLOCK_SIZE;
+ } while (--num_blocks);
+
+ vst1q_u32(h, h_abcd);
}
+#endif /* ARMv8 Crypto Extensions implementation */
-#ifndef WITH_LIBCRYPTO
-/* Compute SHA1 message digest for LEN bytes beginning at BUFFER. The
- result is always in little endian byte order, so that a byte-wise
- output yields to the wanted ASCII representation of the message
- digest. */
-void *sha1_buffer(const char *buffer, size_t len, void *resblock)
-{
- SHA_CTX ctx;
-
- /* Initialize the computation context. */
- sha1_init_ctx(&ctx);
+/*----------------------------------------------------------------------------*
+ * Everything else *
+ *----------------------------------------------------------------------------*/
- /* Process whole buffer but last len % 64 bytes. */
- sha1_process_bytes(buffer, len, &ctx);
-
- /* Put result in desired memory area. */
- return sha1_finish_ctx(&ctx, resblock);
-}
-
-/* Take a pointer to a 160 bit block of data (five 32 bit ints) and
- initialize it to the start constants of the SHA1 algorithm. This
- must be called before using hash in the call to sha1_hash. */
-static void sha1_init_ctx(SHA_CTX *ctx)
+static void
+sha1_blocks(u32 h[5], const void *data, size_t num_blocks)
{
- ctx->A = 0x67452301;
- ctx->B = 0xefcdab89;
- ctx->C = 0x98badcfe;
- ctx->D = 0x10325476;
- ctx->E = 0xc3d2e1f0;
-
- ctx->total[0] = ctx->total[1] = 0;
- ctx->buflen = 0;
+#ifdef HAVE_SHA1_BLOCKS_X86_SHA
+ if ((cpu_features & (X86_CPU_FEATURE_SHA | X86_CPU_FEATURE_SSE4_1)) ==
+ (X86_CPU_FEATURE_SHA | X86_CPU_FEATURE_SSE4_1))
+ return sha1_blocks_x86_sha(h, data, num_blocks);
+#endif
+#ifdef HAVE_SHA1_BLOCKS_X86_AVX_BMI2
+ if ((cpu_features & (X86_CPU_FEATURE_AVX | X86_CPU_FEATURE_BMI2)) ==
+ (X86_CPU_FEATURE_AVX | X86_CPU_FEATURE_BMI2))
+ return sha1_blocks_x86_avx_bmi2(h, data, num_blocks);
+#endif
+#ifdef HAVE_SHA1_BLOCKS_X86_SSSE3
+ if (cpu_features & X86_CPU_FEATURE_SSSE3)
+ return sha1_blocks_x86_ssse3(h, data, num_blocks);
+#endif
+#ifdef HAVE_SHA1_BLOCKS_ARM_CE
+ if (cpu_features & ARM_CPU_FEATURE_SHA1)
+ return sha1_blocks_arm_ce(h, data, num_blocks);
+#endif
+ return sha1_blocks_generic(h, data, num_blocks);
}
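+
+/*
+ * Editor's note: the checks above are ordered so that the fastest supported
+ * implementation wins. SSE4.1 is required alongside the SHA extensions
+ * because sha1_blocks_x86_sha() also uses _mm_extract_epi32(), an SSE4.1
+ * intrinsic.
+ */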
-/* Copy the 4 byte value from v into the memory location pointed to by *cp,
- If your architecture allows unaligned access this is equivalent to
- * (uint32_t *) cp = v */
-static inline void set_uint32(char *cp, uint32_t v)
+/*
+ * Initialize the given SHA-1 context.
+ *
+ * After sha1_init(), call sha1_update() zero or more times to provide the data
+ * to be hashed. Then call sha1_final() to get the resulting message digest.
+ */
+void
+sha1_init(struct sha1_ctx *ctx)
{
- memcpy(cp, &v, sizeof v);
-}
+ ctx->bytecount = 0;
-/* Put result from CTX in first 20 bytes following RESBUF. The result
- must be in little endian byte order. */
-static void *sha1_read_ctx(const SHA_CTX *ctx, void *resbuf)
-{
- char *r = resbuf;
- set_uint32(r + 0 * sizeof ctx->A, SWAP(ctx->A));
- set_uint32(r + 1 * sizeof ctx->B, SWAP(ctx->B));
- set_uint32(r + 2 * sizeof ctx->C, SWAP(ctx->C));
- set_uint32(r + 3 * sizeof ctx->D, SWAP(ctx->D));
- set_uint32(r + 4 * sizeof ctx->E, SWAP(ctx->E));
-
- return resbuf;
+ ctx->h[0] = 0x67452301;
+ ctx->h[1] = 0xEFCDAB89;
+ ctx->h[2] = 0x98BADCFE;
+ ctx->h[3] = 0x10325476;
+ ctx->h[4] = 0xC3D2E1F0;
}
-/* Process the remaining bytes in the internal buffer and the usual
- prolog according to the standard and write the result to RESBUF. */
-static void *sha1_finish_ctx(SHA_CTX *ctx, void *resbuf)
+/* Update the SHA-1 context with @len bytes of data. */
+void
+sha1_update(struct sha1_ctx *ctx, const void *data, size_t len)
{
- /* Take yet unprocessed bytes into account. */
- uint32_t bytes = ctx->buflen;
- size_t size = (bytes < 56) ? 64 / 4 : 64 * 2 / 4;
+ unsigned buffered = ctx->bytecount % SHA1_BLOCK_SIZE;
+ size_t blocks;
- /* Now count remaining bytes. */
- ctx->total[0] += bytes;
- if (ctx->total[0] < bytes)
- ++ctx->total[1];
+ ctx->bytecount += len;
- /* Put the 64-bit file length in *bits* at the end of the buffer. */
- ctx->buffer[size - 2] =
- SWAP((ctx->total[1] << 3) | (ctx->total[0] >> 29));
- ctx->buffer[size - 1] = SWAP(ctx->total[0] << 3);
+ if (buffered) {
+ unsigned remaining = SHA1_BLOCK_SIZE - buffered;
- memcpy(&((char *)ctx->buffer)[bytes], fillbuf, (size - 2) * 4 - bytes);
-
- /* Process last bytes. */
- sha1_process_block(ctx->buffer, size * 4, ctx);
-
- return sha1_read_ctx(ctx, resbuf);
-}
-
-
-static void sha1_process_bytes(const void *buffer, size_t len, SHA_CTX *ctx)
-{
- /* When we already have some bits in our internal buffer concatenate
- both inputs first. */
- if (ctx->buflen != 0) {
- size_t left_over = ctx->buflen;
- size_t add = 128 - left_over > len ? len : 128 - left_over;
-
- memcpy(&((char *)ctx->buffer)[left_over], buffer, add);
- ctx->buflen += add;
-
- if (ctx->buflen > 64) {
- sha1_process_block(ctx->buffer, ctx->buflen & ~63, ctx);
-
- ctx->buflen &= 63;
- /* The regions in the following copy operation cannot overlap. */
- memcpy(ctx->buffer,
- &((char *)ctx->buffer)[(left_over + add) & ~63],
- ctx->buflen);
+ if (len < remaining) {
+ memcpy(&ctx->buffer[buffered], data, len);
+ return;
}
-
- buffer = (const char *)buffer + add;
- len -= add;
+ memcpy(&ctx->buffer[buffered], data, remaining);
+ sha1_blocks(ctx->h, ctx->buffer, 1);
+ data += remaining;
+ len -= remaining;
}
- /* Process available complete blocks. */
- if (len >= 64) {
-#if !_STRING_ARCH_unaligned
-#define alignof(type) offsetof (struct { char c; type x; }, x)
-#define UNALIGNED_P(p) (((size_t) p) % alignof (uint32_t) != 0)
- if (UNALIGNED_P(buffer))
- while (len > 64) {
- sha1_process_block(memcpy
- (ctx->buffer, buffer, 64),
- 64, ctx);
- buffer = (const char *)buffer + 64;
- len -= 64;
- } else
-#endif
- {
- sha1_process_block(buffer, len & ~63, ctx);
- buffer = (const char *)buffer + (len & ~63);
- len &= 63;
- }
+ blocks = len / SHA1_BLOCK_SIZE;
+ if (blocks) {
+ sha1_blocks(ctx->h, data, blocks);
+ data += blocks * SHA1_BLOCK_SIZE;
+ len -= blocks * SHA1_BLOCK_SIZE;
}
- /* Move remaining bytes in internal buffer. */
- if (len > 0) {
- size_t left_over = ctx->buflen;
+ if (len)
+ memcpy(ctx->buffer, data, len);
+}
- memcpy(&((char *)ctx->buffer)[left_over], buffer, len);
- left_over += len;
- if (left_over >= 64) {
- sha1_process_block(ctx->buffer, 64, ctx);
- left_over -= 64;
- memcpy(ctx->buffer, &ctx->buffer[16], left_over);
- }
- ctx->buflen = left_over;
+/* Finalize the SHA-1 operation and return the resulting message digest. */
+void
+sha1_final(struct sha1_ctx *ctx, u8 hash[SHA1_HASH_SIZE])
+{
+ unsigned buffered = ctx->bytecount % SHA1_BLOCK_SIZE;
+ const be64 bitcount = cpu_to_be64(ctx->bytecount * 8);
+
+ ctx->buffer[buffered++] = 0x80;
+ if (buffered > SHA1_BLOCK_SIZE - 8) {
+ memset(&ctx->buffer[buffered], 0, SHA1_BLOCK_SIZE - buffered);
+ sha1_blocks(ctx->h, ctx->buffer, 1);
+ buffered = 0;
}
+ memset(&ctx->buffer[buffered], 0, SHA1_BLOCK_SIZE - 8 - buffered);
+ memcpy(&ctx->buffer[SHA1_BLOCK_SIZE - 8], &bitcount, 8);
+ sha1_blocks(ctx->h, ctx->buffer, 1);
+
+ put_unaligned_be32(ctx->h[0], &hash[0]);
+ put_unaligned_be32(ctx->h[1], &hash[4]);
+ put_unaligned_be32(ctx->h[2], &hash[8]);
+ put_unaligned_be32(ctx->h[3], &hash[12]);
+ put_unaligned_be32(ctx->h[4], &hash[16]);
}
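+
+/*
+ * Worked example (editor's note): for the 3-byte message "abc", the final
+ * block is "abc", then 0x80, then 52 zero bytes, then the 8-byte big-endian
+ * bit count (24), filling exactly one 64-byte block.
+ */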
-/* --- Code below is the primary difference between md5.c and sha1.c --- */
-
-/* SHA1 round constants */
-#define K1 0x5a827999
-#define K2 0x6ed9eba1
-#define K3 0x8f1bbcdc
-#define K4 0xca62c1d6
+/* Calculate the SHA-1 message digest of the given data. */
+void
+sha1(const void *data, size_t len, u8 hash[SHA1_HASH_SIZE])
+{
+ struct sha1_ctx ctx;
-/* Round functions. Note that F2 is the same as F4. */
-#define F1(B,C,D) ( D ^ ( B & ( C ^ D ) ) )
-#define F2(B,C,D) (B ^ C ^ D)
-#define F3(B,C,D) ( ( B & C ) | ( D & ( B | C ) ) )
-#define F4(B,C,D) (B ^ C ^ D)
+ sha1_init(&ctx);
+ sha1_update(&ctx, data, len);
+ sha1_final(&ctx, hash);
+}
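+
+/*
+ * Usage sketch (editor's note, not compiled into this file): checking the
+ * streaming API against the FIPS 180-1 "abc" test vector; sha1() above is
+ * the one-shot equivalent. memcmp() is assumed to be reachable through the
+ * headers above.
+ */
+#if 0
+static int
+sha1_abc_test_vector_ok(void)
+{
+	static const u8 expected[SHA1_HASH_SIZE] = {
+		0xa9, 0x99, 0x3e, 0x36, 0x47, 0x06, 0x81, 0x6a, 0xba, 0x3e,
+		0x25, 0x71, 0x78, 0x50, 0xc2, 0x6c, 0x9c, 0xd0, 0xd8, 0x9d,
+	};
+	struct sha1_ctx ctx;
+	u8 hash[SHA1_HASH_SIZE];
+
+	sha1_init(&ctx);
+	sha1_update(&ctx, "a", 1);	/* input may arrive in any-sized pieces */
+	sha1_update(&ctx, "bc", 2);
+	sha1_final(&ctx, hash);
+	return memcmp(hash, expected, SHA1_HASH_SIZE) == 0;
+}
+#endif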
-/* Process LEN bytes of BUFFER, accumulating context into CTX.
- It is assumed that LEN % 64 == 0.
- Most of this code comes from GnuPG's cipher/sha1.c. */
+/* "Null" SHA-1 message digest containing all 0's */
+const u8 zero_hash[SHA1_HASH_SIZE];
-#ifndef ENABLE_SSSE3_SHA1
-static void sha1_process_block(const void *buffer, size_t len, SHA_CTX *ctx)
+/* Build a hexadecimal string representation of a SHA-1 message digest. */
+void
+sprint_hash(const u8 hash[SHA1_HASH_SIZE], tchar strbuf[SHA1_HASH_STRING_LEN])
{
- const uint32_t *words = buffer;
- size_t nwords = len / sizeof(uint32_t);
- const uint32_t *endp = words + nwords;
- uint32_t x[16];
- uint32_t a = ctx->A;
- uint32_t b = ctx->B;
- uint32_t c = ctx->C;
- uint32_t d = ctx->D;
- uint32_t e = ctx->E;
-
- /* First increment the byte count. RFC 1321 specifies the possible
- length of the file up to 2^64 bits. Here we only compute the
- number of bytes. Do a double word increment. */
- ctx->total[0] += len;
- if (ctx->total[0] < len)
- ++ctx->total[1];
-
-#define rol(x, n) (((x) << (n)) | ((uint32_t) (x) >> (32 - (n))))
-
-#define M(I) ( tm = x[I&0x0f] ^ x[(I-14)&0x0f] \
- ^ x[(I-8)&0x0f] ^ x[(I-3)&0x0f] \
- , (x[I&0x0f] = rol(tm, 1)) )
-
-#define R(A,B,C,D,E,F,K,M) do { E += rol( A, 5 ) \
- + F( B, C, D ) \
- + K \
- + M; \
- B = rol( B, 30 ); \
- } while(0)
-
- while (words < endp) {
- uint32_t tm;
- int t;
- for (t = 0; t < 16; t++) {
- x[t] = SWAP(*words);
- words++;
- }
-
- R(a, b, c, d, e, F1, K1, x[0]);
- R(e, a, b, c, d, F1, K1, x[1]);
- R(d, e, a, b, c, F1, K1, x[2]);
- R(c, d, e, a, b, F1, K1, x[3]);
- R(b, c, d, e, a, F1, K1, x[4]);
- R(a, b, c, d, e, F1, K1, x[5]);
- R(e, a, b, c, d, F1, K1, x[6]);
- R(d, e, a, b, c, F1, K1, x[7]);
- R(c, d, e, a, b, F1, K1, x[8]);
- R(b, c, d, e, a, F1, K1, x[9]);
- R(a, b, c, d, e, F1, K1, x[10]);
- R(e, a, b, c, d, F1, K1, x[11]);
- R(d, e, a, b, c, F1, K1, x[12]);
- R(c, d, e, a, b, F1, K1, x[13]);
- R(b, c, d, e, a, F1, K1, x[14]);
- R(a, b, c, d, e, F1, K1, x[15]);
- R(e, a, b, c, d, F1, K1, M(16));
- R(d, e, a, b, c, F1, K1, M(17));
- R(c, d, e, a, b, F1, K1, M(18));
- R(b, c, d, e, a, F1, K1, M(19));
- R(a, b, c, d, e, F2, K2, M(20));
- R(e, a, b, c, d, F2, K2, M(21));
- R(d, e, a, b, c, F2, K2, M(22));
- R(c, d, e, a, b, F2, K2, M(23));
- R(b, c, d, e, a, F2, K2, M(24));
- R(a, b, c, d, e, F2, K2, M(25));
- R(e, a, b, c, d, F2, K2, M(26));
- R(d, e, a, b, c, F2, K2, M(27));
- R(c, d, e, a, b, F2, K2, M(28));
- R(b, c, d, e, a, F2, K2, M(29));
- R(a, b, c, d, e, F2, K2, M(30));
- R(e, a, b, c, d, F2, K2, M(31));
- R(d, e, a, b, c, F2, K2, M(32));
- R(c, d, e, a, b, F2, K2, M(33));
- R(b, c, d, e, a, F2, K2, M(34));
- R(a, b, c, d, e, F2, K2, M(35));
- R(e, a, b, c, d, F2, K2, M(36));
- R(d, e, a, b, c, F2, K2, M(37));
- R(c, d, e, a, b, F2, K2, M(38));
- R(b, c, d, e, a, F2, K2, M(39));
- R(a, b, c, d, e, F3, K3, M(40));
- R(e, a, b, c, d, F3, K3, M(41));
- R(d, e, a, b, c, F3, K3, M(42));
- R(c, d, e, a, b, F3, K3, M(43));
- R(b, c, d, e, a, F3, K3, M(44));
- R(a, b, c, d, e, F3, K3, M(45));
- R(e, a, b, c, d, F3, K3, M(46));
- R(d, e, a, b, c, F3, K3, M(47));
- R(c, d, e, a, b, F3, K3, M(48));
- R(b, c, d, e, a, F3, K3, M(49));
- R(a, b, c, d, e, F3, K3, M(50));
- R(e, a, b, c, d, F3, K3, M(51));
- R(d, e, a, b, c, F3, K3, M(52));
- R(c, d, e, a, b, F3, K3, M(53));
- R(b, c, d, e, a, F3, K3, M(54));
- R(a, b, c, d, e, F3, K3, M(55));
- R(e, a, b, c, d, F3, K3, M(56));
- R(d, e, a, b, c, F3, K3, M(57));
- R(c, d, e, a, b, F3, K3, M(58));
- R(b, c, d, e, a, F3, K3, M(59));
- R(a, b, c, d, e, F4, K4, M(60));
- R(e, a, b, c, d, F4, K4, M(61));
- R(d, e, a, b, c, F4, K4, M(62));
- R(c, d, e, a, b, F4, K4, M(63));
- R(b, c, d, e, a, F4, K4, M(64));
- R(a, b, c, d, e, F4, K4, M(65));
- R(e, a, b, c, d, F4, K4, M(66));
- R(d, e, a, b, c, F4, K4, M(67));
- R(c, d, e, a, b, F4, K4, M(68));
- R(b, c, d, e, a, F4, K4, M(69));
- R(a, b, c, d, e, F4, K4, M(70));
- R(e, a, b, c, d, F4, K4, M(71));
- R(d, e, a, b, c, F4, K4, M(72));
- R(c, d, e, a, b, F4, K4, M(73));
- R(b, c, d, e, a, F4, K4, M(74));
- R(a, b, c, d, e, F4, K4, M(75));
- R(e, a, b, c, d, F4, K4, M(76));
- R(d, e, a, b, c, F4, K4, M(77));
- R(c, d, e, a, b, F4, K4, M(78));
- R(b, c, d, e, a, F4, K4, M(79));
-
- a = ctx->A += a;
- b = ctx->B += b;
- c = ctx->C += c;
- d = ctx->D += d;
- e = ctx->E += e;
+ int i;
+ u8 high, low;
+
+ for (i = 0; i < SHA1_HASH_SIZE; i++) {
+ high = hash[i] >> 4;
+ low = hash[i] & 0xF;
+ strbuf[i * 2 + 0] = (high < 10 ? high + '0' : high - 10 + 'a');
+ strbuf[i * 2 + 1] = (low < 10 ? low + '0' : low - 10 + 'a');
}
+ strbuf[i * 2] = 0;
}
-#endif /* ENABLE_SSSE3_SHA1 */
-
-#endif /* WITH_LIBCRYPTO */