wimlib.net Git - wimlib/blob - src/sha1.c

   1 /*
   2  * sha1.c - implementation of the Secure Hash Algorithm version 1 (FIPS 180-1)
   3  *
   4  * Copyright 2022-2023 Eric Biggers
   5  *
   6  * Permission is hereby granted, free of charge, to any person
   7  * obtaining a copy of this software and associated documentation
   8  * files (the "Software"), to deal in the Software without
   9  * restriction, including without limitation the rights to use,
  10  * copy, modify, merge, publish, distribute, sublicense, and/or sell
  11  * copies of the Software, and to permit persons to whom the
  12  * Software is furnished to do so, subject to the following
  13  * conditions:
  14  *
  15  * The above copyright notice and this permission notice shall be
  16  * included in all copies or substantial portions of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
  20  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  21  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
  22  * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
  23  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  24  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  25  * OTHER DEALINGS IN THE SOFTWARE.
  26  */
  27
  28 #ifdef HAVE_CONFIG_H
  29 #  include "config.h"
  30 #endif
  31
  32 #include "wimlib/cpu_features.h"
  33 #include "wimlib/endianness.h"
  34 #include "wimlib/sha1.h"
  35 #include "wimlib/unaligned.h"
  36
  37 /*----------------------------------------------------------------------------*
  38  *                              Shared helpers                                *
  39  *----------------------------------------------------------------------------*/
  40
  41 static inline u32
  42 rol32(u32 v, int bits)
  43 {
  44         return (v << bits) | (v >> (32 - bits));
  45 }
  46
  47 /* Expands to the round constant for the given round */
  48 #define SHA1_K(i)                       \
  49         (((i) < 20) ? 0x5A827999 :      \
  50          ((i) < 40) ? 0x6ED9EBA1 :      \
  51          ((i) < 60) ? 0x8F1BBCDC :      \
  52                       0xCA62C1D6)
  53
  54 /* Expands to the computation on b, c, and d for the given round */
  55 #define SHA1_F(i, b, c, d)                                      \
  56         (((i) < 20) ? /* Choice */ (b & (c ^ d)) ^ d :          \
  57          ((i) < 40) ? /* Parity */ b ^ c ^ d :                  \
  58          ((i) < 60) ? /* Majority */ (c & d) ^ (b & (c ^ d)) :  \
  59                       /* Parity */ b ^ c ^ d)
  60
  61 /*
  62  * Expands to a memory barrier for the given array, preventing values of the
  63  * array from being cached in registers past the barrier.  Use this to prevent
  64  * the compiler from making counter-productive optimizations when there aren't
  65  * enough registers available to hold the full array.
  66  */
  67 #define FORCE_NOT_CACHED(array) asm volatile("" : "+m" (array))
  68
  69 /*
  70  * Expands to FORCE_NOT_CACHED() if the architecture has 16 or fewer general
  71  * purpose registers, otherwise does nothing.
  72  */
  73 #if defined(__i386__) || defined(__x86_64__) || defined(__arm__)
  74 #  define FORCE_NOT_CACHED_IF_FEW_REGS(array)   FORCE_NOT_CACHED(array)
  75 #else
  76 #  define FORCE_NOT_CACHED_IF_FEW_REGS(array)   (void)(array)
  77 #endif
  78
  79 /*----------------------------------------------------------------------------*
  80  *                         Generic implementation                             *
  81  *----------------------------------------------------------------------------*/
  82
  83 /*
  84  * This is SHA-1 in portable C code.  It computes the message schedule
  85  * just-in-time, in a rolling window of length 16.
  86  */
  87
  88 #define SHA1_GENERIC_ROUND(i, a, b, c, d, e)                            \
  89         FORCE_NOT_CACHED_IF_FEW_REGS(w);                                \
  90         if ((i) < 16)                                                   \
  91                 w[i] = get_unaligned_be32(data + ((i) * 4));            \
  92         else                                                            \
  93                 w[(i) % 16] = rol32(w[((i) - 16) % 16] ^                \
  94                                     w[((i) - 14) % 16] ^                \
  95                                     w[((i) -  8) % 16] ^                \
  96                                     w[((i) -  3) % 16], 1);             \
  97         e += w[(i) % 16] + rol32(a, 5) + SHA1_F((i), b, c, d) + SHA1_K(i); \
  98         b = rol32(b, 30);
  99         /* implicit: the new (a, b, c, d, e) is the old (e, a, b, c, d) */
 100
 101 #define SHA1_GENERIC_5ROUNDS(i)                         \
 102         SHA1_GENERIC_ROUND((i) + 0, a, b, c, d, e);     \
 103         SHA1_GENERIC_ROUND((i) + 1, e, a, b, c, d);     \
 104         SHA1_GENERIC_ROUND((i) + 2, d, e, a, b, c);     \
 105         SHA1_GENERIC_ROUND((i) + 3, c, d, e, a, b);     \
 106         SHA1_GENERIC_ROUND((i) + 4, b, c, d, e, a);
 107
 108 #define SHA1_GENERIC_20ROUNDS(i)        \
 109         SHA1_GENERIC_5ROUNDS((i) +  0); \
 110         SHA1_GENERIC_5ROUNDS((i) +  5); \
 111         SHA1_GENERIC_5ROUNDS((i) + 10); \
 112         SHA1_GENERIC_5ROUNDS((i) + 15);
 113
 114 static void
 115 sha1_blocks_generic(u32 h[5], const void *data, size_t num_blocks)
 116 {
 117         do {
 118                 u32 a = h[0];
 119                 u32 b = h[1];
 120                 u32 c = h[2];
 121                 u32 d = h[3];
 122                 u32 e = h[4];
 123                 u32 w[16];
 124
 125                 SHA1_GENERIC_20ROUNDS(0);
 126                 SHA1_GENERIC_20ROUNDS(20);
 127                 SHA1_GENERIC_20ROUNDS(40);
 128                 SHA1_GENERIC_20ROUNDS(60);
 129
 130                 h[0] += a;
 131                 h[1] += b;
 132                 h[2] += c;
 133                 h[3] += d;
 134                 h[4] += e;
 135                 data += SHA1_BLOCK_SIZE;
 136         } while (--num_blocks);
 137 }
 138
 139 /*----------------------------------------------------------------------------*
 140  *                    x86 SSSE3 (and AVX+BMI2) implementation                 *
 141  *----------------------------------------------------------------------------*/
 142
 143 /*
 144  * This is SHA-1 using the x86 SSSE3 instructions.  A copy of it is also
 145  * compiled with AVX and BMI2 code generation enabled for improved performance.
 146  *
 147  * Unfortunately this isn't actually much faster than the generic
 148  * implementation, since only the message schedule can be vectorized, not the
 149  * SHA itself.  The vectorized computation of the message schedule is
 150  * interleaved with the scalar computation of the SHA itself.
 151  *
 152  * Specifically, 16 rounds ahead of time, the words of the message schedule are
 153  * calculated, the round constants are added to them, and they are stored in a
 154  * temporary array that the scalar code reads from later.  This is done 4 words
 155  * at a time, but split into 4 steps, so that one step is executed during each
 156  * round.  Rounds 16-31 use the usual formula 'w[i] = rol32(w[i-16] ^ w[i-14] ^
 157  * w[i-8] ^ w[i-3], 1)', while rounds 32-79 use the equivalent formula 'w[i] =
 158  * rol32(w[i-32] ^ w[i-28] ^ w[i-16] ^ w[i-6], 2)' for improved vectorization.
 159  *
 160  * During rounds 64-79, the precalculation steps numbered 80-95 prepare the
 161  * first 16 message schedule words for the next block.
 162  */
 163 #if defined(__i386__) || defined(__x86_64__)
 164 #include <immintrin.h>
 165
 /*
  * SHA1_SSSE3_PRECALC(i, ...): one quarter of the vectorized computation of
  * four message schedule words.  'i' is the schedule step being prepared; the
  * round macros pass the current round number plus 16 (or 17).
  *
  *   i < 32        formula rol32(w[i-16] ^ w[i-14] ^ w[i-8] ^ w[i-3], 1)
  *   32 <= i < 80  formula rol32(w[i-32] ^ w[i-28] ^ w[i-16] ^ w[i-6], 2)
  *   80 <= i < 96  load and byte-swap 16 bytes of the next block
  *
  * 'k' is reloaded whenever i % 20 == 0, and in every regime the step with
  * i % 4 == 3 stores four "schedule word + k" sums into tmp[].  The expansion
  * uses k, t0, t1, t2, tmp, data, num_blocks and bswap32_mask from the
  * expansion site.
  *
  * Step 80 decrements num_blocks and advances 'data' only when blocks remain;
  * otherwise steps 80-95 load from the current block again (presumably so that
  * no bytes past the end of the input are read; the enclosing loop then exits
  * without using the stored values).
  */
 166 #define SHA1_SSSE3_PRECALC(i, w0, w1, w2, w3, w4, w5, w6, w7)           \
 167         if ((i) % 20 == 0)                                              \
 168                 k = _mm_set1_epi32(SHA1_K((i) % 80));                   \
 169         if ((i) < 32) {                                                 \
 170                 /*
 171                  * Vectorized computation of w[i] = rol32(w[i-16] ^ w[i-14] ^
 172                  * w[i-8] ^ w[i-3], 1) for i...i+3, split into 4 steps.
 173                  * w[i-16..i+3] are in (w0, w1, w2, w3, w4).
 174                  */                                                     \
 175                 if ((i) % 4 == 0) {                                     \
 176                         w4 = _mm_alignr_epi8(w1, w0, 8) ^ w2;           \
 177                         t0 = _mm_srli_si128(w3, 4);                     \
 178                 } else if ((i) % 4 == 1) {                              \
 179                         t0 ^= w4 ^ w0;                                  \
 180                         t1 = _mm_slli_si128(t0, 12);                    \
 181                 } else if ((i) % 4 == 2) {                              \
 182                         t2 = _mm_slli_epi32(t1, 2);                     \
 183                         w4 = _mm_slli_epi32(t0, 1);                     \
 184                         t0 = _mm_srli_epi32(t0, 31);                    \
 185                         t2 ^= _mm_srli_epi32(t1, 30);                   \
 186                 } else {                                                \
 187                         w4 ^= t0 ^ t2;                                  \
 188                         t0 = _mm_add_epi32(w4, k);                      \
 189                         _mm_store_si128((__m128i *)&tmp[((i) - 3) % 16], t0);   \
 190                 }                                                       \
 191         } else if ((i) < 80) {                                          \
 192                 /*
 193                  * Vectorized computation of w[i] = rol32(w[i-32] ^ w[i-28] ^
 194                  * w[i-16] ^ w[i-6], 2) for i...i+3, split into 4 steps.
 195                  * w[i-32..i+3] are in (w4, w5, w6, w7, w0, w1, w2, w3, w4);
 196                  * note the reuse of w4.
 197                  */                                                     \
 198                 if ((i) % 4 == 0)                                       \
 199                         w4 ^= _mm_alignr_epi8(w3, w2, 8);               \
 200                 else if ((i) % 4 == 1)                                  \
 201                         w4 ^= w5 ^ w0;                                  \
 202                 else if ((i) % 4 == 2)                                  \
 203                         w4 = _mm_slli_epi32(w4, 2) ^                    \
 204                              _mm_srli_epi32(w4, 30);                    \
 205                 else                                                    \
 206                         _mm_store_si128((__m128i *)&tmp[((i) - 3) % 16],\
 207                                         _mm_add_epi32(w4, k));          \
 208         } else if ((i) < 96) {                                          \
 209                 /* Precomputation of w[0..15] for next block */         \
 210                 if ((i) == 80 && --num_blocks != 0)                     \
 211                         data += SHA1_BLOCK_SIZE;                        \
 212                 if ((i) % 4 == 0)                                       \
 213                         w0 = _mm_loadu_si128(data + (((i) - 80) * 4));  \
 214                 else if ((i) % 4 == 1)                                  \
 215                         w0 = _mm_shuffle_epi8(w0, bswap32_mask);        \
 216                 else if ((i) % 4 == 2)                                  \
 217                         t0 = _mm_add_epi32(w0, k);                      \
 218                 else                                                    \
 219                         _mm_store_si128((__m128i *)&tmp[(i) - 83], t0); \
 220         }
 221
 /*
  * Two scalar rounds (i and i + 1), each followed by one precalculation step
  * for the schedule index 16 ahead of it.  The tmp[] entries already include
  * the round constant (it is added before they are stored), so only rol32()
  * and SHA1_F() terms are added here.  FORCE_NOT_CACHED(tmp) is issued before
  * each read of tmp[].
  */
 222 #define SHA1_SSSE3_2ROUNDS(i, a, b, c, d, e, w0, w1, w2, w3, w4, w5, w6, w7) \
 223         FORCE_NOT_CACHED(tmp);                                          \
 224         e += tmp[(i) % 16] + rol32(a, 5) + SHA1_F((i), b, c, d);        \
 225         b = rol32(b, 30);                                               \
 226         SHA1_SSSE3_PRECALC((i) + 16, w0, w1, w2, w3, w4, w5, w6, w7);   \
 227         FORCE_NOT_CACHED(tmp);                                          \
 228         d += tmp[((i) + 1) % 16] + rol32(e, 5) + SHA1_F((i) + 1, a, b, c); \
 229         SHA1_SSSE3_PRECALC((i) + 17, w0, w1, w2, w3, w4, w5, w6, w7);   \
 230         a = rol32(a, 30);
 231         /* implicit: the new (a, b, c, d, e) is the old (d, e, a, b, c) */
 232
 /*
  * Four rounds, built from two SHA1_SSSE3_2ROUNDS with the working variables
  * renamed for the second pair.  Together the two expansions run precalculation
  * steps (i)+16 through (i)+19, i.e. the four steps that produce one group of
  * four schedule words.
  */
 233 #define SHA1_SSSE3_4ROUNDS(i, a, b, c, d, e, w0, w1, w2, w3, w4, w5, w6, w7)    \
 234         SHA1_SSSE3_2ROUNDS((i) + 0, a, b, c, d, e, w0, w1, w2, w3, w4, w5, w6, w7); \
 235         SHA1_SSSE3_2ROUNDS((i) + 2, d, e, a, b, c, w0, w1, w2, w3, w4, w5, w6, w7); \
 236         /*
 237          * implicit: the new (w0-w7) is the old (w1-w7,w0),
 238          * and the new (a, b, c, d, e) is the old (b, c, d, e, a)
 239          */
 240
 /*
  * Twenty rounds, built from five SHA1_SSSE3_4ROUNDS.  Each successive
  * expansion is passed the working variables a..e and the vector names w0..w7
  * rotated by one position.  The names a, b, c, d, e are taken from the
  * expansion site.
  */
 241 #define SHA1_SSSE3_20ROUNDS(i, w0, w1, w2, w3, w4, w5, w6, w7)          \
 242         SHA1_SSSE3_4ROUNDS((i) +  0, a, b, c, d, e, w0, w1, w2, w3, w4, w5, w6, w7); \
 243         SHA1_SSSE3_4ROUNDS((i) +  4, b, c, d, e, a, w1, w2, w3, w4, w5, w6, w7, w0); \
 244         SHA1_SSSE3_4ROUNDS((i) +  8, c, d, e, a, b, w2, w3, w4, w5, w6, w7, w0, w1); \
 245         SHA1_SSSE3_4ROUNDS((i) + 12, d, e, a, b, c, w3, w4, w5, w6, w7, w0, w1, w2); \
 246         SHA1_SSSE3_4ROUNDS((i) + 16, e, a, b, c, d, w4, w5, w6, w7, w0, w1, w2, w3);
 247         /* implicit: the new (w0-w7) is the old (w5-w7,w0-w4) */
 248
 /*
  * Function body shared by the SSSE3 and AVX+BMI2 variants.  It expects the
  * names 'h', 'data' and 'num_blocks' from the enclosing function's parameter
  * list.
  *
  * Before the loop, the 16 words of the first block are loaded, byte-swapped
  * and stored into tmp[] with the first round constant added.  Each loop
  * iteration then runs the 80 rounds of one block and adds the result to h[].
  * 'num_blocks' is decremented (and 'data' advanced) inside
  * SHA1_SSSE3_PRECALC, not in the loop statement itself.  Since the first
  * block is processed unconditionally, num_blocks is expected to be nonzero.
  */
 249 #define SHA1_SSSE3_BODY                                                 \
 250         const __m128i bswap32_mask =                                    \
 251                 _mm_setr_epi8( 3,  2,  1,  0,  7,  6,  5,  4,           \
 252                               11, 10,  9,  8, 15, 14, 13, 12);          \
 253         __m128i w0, w1, w2, w3, w4, w5, w6, w7;                         \
 254         __m128i k = _mm_set1_epi32(SHA1_K(0));                          \
 255         u32 tmp[16] __attribute__((aligned(16)));                       \
 256                                                                         \
 257         w0 = _mm_shuffle_epi8(_mm_loadu_si128(data +  0), bswap32_mask); \
 258         w1 = _mm_shuffle_epi8(_mm_loadu_si128(data + 16), bswap32_mask); \
 259         w2 = _mm_shuffle_epi8(_mm_loadu_si128(data + 32), bswap32_mask); \
 260         w3 = _mm_shuffle_epi8(_mm_loadu_si128(data + 48), bswap32_mask); \
 261         _mm_store_si128((__m128i *)&tmp[0], _mm_add_epi32(w0, k));      \
 262         _mm_store_si128((__m128i *)&tmp[4], _mm_add_epi32(w1, k));      \
 263         _mm_store_si128((__m128i *)&tmp[8], _mm_add_epi32(w2, k));      \
 264         _mm_store_si128((__m128i *)&tmp[12], _mm_add_epi32(w3, k));     \
 265                                                                         \
 266         do {                                                            \
 267                 u32 a = h[0];                                           \
 268                 u32 b = h[1];                                           \
 269                 u32 c = h[2];                                           \
 270                 u32 d = h[3];                                           \
 271                 u32 e = h[4];                                           \
 272                 __m128i t0, t1, t2;                                     \
 273                                                                         \
 274                 SHA1_SSSE3_20ROUNDS(0, w0, w1, w2, w3, w4, w5, w6, w7); \
 275                 SHA1_SSSE3_20ROUNDS(20, w5, w6, w7, w0, w1, w2, w3, w4); \
 276                 SHA1_SSSE3_20ROUNDS(40, w2, w3, w4, w5, w6, w7, w0, w1); \
 277                 SHA1_SSSE3_20ROUNDS(60, w7, w0, w1, w2, w3, w4, w5, w6); \
 278                                                                         \
 279                 h[0] += a;                                              \
 280                 h[1] += b;                                              \
 281                 h[2] += c;                                              \
 282                 h[3] += d;                                              \
 283                 h[4] += e;                                              \
 284                                                                         \
 285                 /* 'data' and 'num_blocks' were updated at start of round 64. */ \
 286         } while (num_blocks);
 287
 /*
  * SSSE3 variant: SHA1_SSSE3_BODY compiled with SSSE3 code generation enabled
  * for this function only.  Updates the state @h in place with @num_blocks
  * blocks read from @data.  HAVE_SHA1_BLOCKS_X86_SSSE3 signals to the
  * dispatcher that this function exists.
  */
 288 #define HAVE_SHA1_BLOCKS_X86_SSSE3
 289 static void __attribute__((target("ssse3")))
 290 sha1_blocks_x86_ssse3(u32 h[5], const void *data, size_t num_blocks)
 291 {
 292         SHA1_SSSE3_BODY;
 293 }
 294
 /*
  * AVX+BMI2 variant: the same SHA1_SSSE3_BODY source, compiled with AVX and
  * BMI2 code generation enabled for this function only.  Updates the state @h
  * in place with @num_blocks blocks read from @data.
  * HAVE_SHA1_BLOCKS_X86_AVX_BMI2 signals to the dispatcher that this function
  * exists.
  */
 295 #define HAVE_SHA1_BLOCKS_X86_AVX_BMI2
 296 static void __attribute__((target("avx,bmi2")))
 297 sha1_blocks_x86_avx_bmi2(u32 h[5], const void *data, size_t num_blocks)
 298 {
 299         SHA1_SSSE3_BODY;
 300 }
 301 #endif /* x86 SSSE3 (and AVX+BMI2) implementation */
 302
 303 /*----------------------------------------------------------------------------*
 304  *                        x86 SHA Extensions implementation                   *
 305  *----------------------------------------------------------------------------*/
 306
 307 /*
 308  * This is SHA-1 using the x86 SHA extensions.
 309  *
 310  * The SHA1RNDS4 instruction does most of the work.  It takes in a 128-bit
 311  * vector containing 'a', 'b', 'c', and 'd' (high-order to low-order), a 128-bit
 312  * vector containing the next 4 words of the message schedule with 'e' added to
 313  * the high-order word, and an immediate that identifies the current 20-round
 314  * section.  It does 4 rounds and updates 'a', 'b', 'c', and 'd' accordingly.
 315  *
 316  * Each SHA1RNDS4 is paired with SHA1NEXTE.  It takes in the abcd vector,
 317  * calculates the value of 'e' after 4 rounds, and adds it to the high-order
 318  * word of a vector that contains the next 4 words of the message schedule.
 319  *
 320  * Each 4 words of the message schedule for rounds 16-79 is calculated as
 321  * rol32(w[i-16] ^ w[i-14] ^ w[i-8] ^ w[i-3], 1) in three steps using the
 322  * SHA1MSG1, PXOR, and SHA1MSG2 instructions.  This happens in a rolling window,
 323  * so during the j'th set of 4 rounds we do the SHA1MSG2 step for j+1'th set of
 324  * message schedule words, PXOR for j+2'th set, and SHA1MSG1 for the j+3'th set.
 325  */
 326 #if defined(__i386__) || defined(__x86_64__)
 327 #include <immintrin.h>
 328
 /*
  * Four rounds starting at round 'i', using names from the expansion site
  * (data, bswap_mask, h_e, abcd).
  *
  * - For i < 16, w0 is first loaded from the block and byte-reversed.
  * - we0 becomes the schedule words with 'e' folded in: for i == 0 by adding
  *   h_e, otherwise via SHA1NEXTE from the abcd value saved in we0 by the
  *   previous expansion.
  * - The current abcd is saved in we1 before SHA1RNDS4 overwrites it; (i) / 20
  *   is the immediate selecting the 20-round section.
  * - The three guarded statements advance the rolling message schedule
  *   computation; their round ranges are staggered by 4 so that each step is
  *   applied to a different upcoming group of four words.
  */
 329 #define SHA1_NI_4ROUNDS(i, w0, w1, w2, w3, we0, we1)                    \
 330         if ((i) < 16)                                                   \
 331                 w0 = _mm_shuffle_epi8(                                  \
 332                         _mm_loadu_si128(data + ((i) * 4)), bswap_mask); \
 333         if ((i) == 0)                                                   \
 334                 we0 = _mm_add_epi32(h_e, w0);                           \
 335         else                                                            \
 336                 we0 = _mm_sha1nexte_epu32(/* old abcd */ we0, w0);      \
 337         we1 = abcd;                                                     \
 338         if ((i) >= 12 && (i) < 76)                                      \
 339                 w1 = _mm_sha1msg2_epu32(w1, w0);                        \
 340         abcd = _mm_sha1rnds4_epu32(abcd, we0, (i) / 20);                \
 341         if ((i) >= 8 && (i) < 72)                                       \
 342                 w2 ^= w0;                                               \
 343         if ((i) >= 4 && (i) < 68)                                       \
 344                 w3 = _mm_sha1msg1_epu32(w3, w0);                        \
 345         /*
 346          * implicit: the new (w0, w1, w2, w3) is the old (w1, w2, w3, w0),
 347          * and the new (we0, we1) is the old (we1, we0)
 348          */
 349
 /*
  * Sixteen rounds as four SHA1_NI_4ROUNDS.  The schedule vectors w0..w3 are
  * passed rotated by one position each time and we0/we1 are swapped each time,
  * so after four expansions the names are back in their original roles.
  */
 350 #define SHA1_NI_16ROUNDS(i)                                     \
 351         SHA1_NI_4ROUNDS((i) +  0, w0, w1, w2, w3, we0, we1);    \
 352         SHA1_NI_4ROUNDS((i) +  4, w1, w2, w3, w0, we1, we0);    \
 353         SHA1_NI_4ROUNDS((i) +  8, w2, w3, w0, w1, we0, we1);    \
 354         SHA1_NI_4ROUNDS((i) + 12, w3, w0, w1, w2, we1, we0);
 355
 /*
  * SHA Extensions variant.  Updates the state @h in place with @num_blocks
  * blocks read from @data; the do/while loop always processes at least one
  * block, so @num_blocks is expected to be nonzero.
  *
  * h[0..3] are kept in h_abcd with the order of the four 32-bit lanes reversed
  * (shuffle 0x1B), and h[4] is kept in the highest lane of h_e.  Both are
  * written back to @h only after the last block.
  */
 356 #define HAVE_SHA1_BLOCKS_X86_SHA
 357 static void __attribute__((target("sha,sse4.1")))
 358 sha1_blocks_x86_sha(u32 h[5], const void *data, size_t num_blocks)
 359 {
      /* Shuffle control that reverses all 16 bytes of a vector. */
 360         const __m128i bswap_mask =
 361                 _mm_setr_epi8(15, 14, 13, 12, 11, 10,  9,  8,
 362                               7,  6,   5,  4,  3,  2,  1,  0);
 363         __m128i h_abcd = _mm_shuffle_epi32(
 364                                 _mm_loadu_si128((__m128i *)h), 0x1B);
 365         __m128i h_e = _mm_setr_epi32(0, 0, 0, h[4]);
 366
 367         do {
 368                 __m128i abcd = h_abcd;
 369                 __m128i w0, w1, w2, w3, we0, we1;
 370
      /* 5 x 16 = 80 rounds for this block */
 371                 SHA1_NI_16ROUNDS(0);
 372                 SHA1_NI_16ROUNDS(16);
 373                 SHA1_NI_16ROUNDS(32);
 374                 SHA1_NI_16ROUNDS(48);
 375                 SHA1_NI_16ROUNDS(64);
 376
      /* Fold this block's result into the running state. */
 377                 h_abcd = _mm_add_epi32(h_abcd, abcd);
 378                 h_e = _mm_sha1nexte_epu32(we0, h_e);
 379                 data += SHA1_BLOCK_SIZE;
 380         } while (--num_blocks);
 381
      /* Undo the lane reversal and write the state back. */
 382         _mm_storeu_si128((__m128i *)h, _mm_shuffle_epi32(h_abcd, 0x1B));
 383         h[4] = _mm_extract_epi32(h_e, 3);
 384 }
 385 #endif /* x86 SHA Extensions implementation */
 386
 387 /*----------------------------------------------------------------------------*
 388  *                     ARMv8 Crypto Extensions implementation                 *
 389  *----------------------------------------------------------------------------*/
 390
 391 /*
 392  * This is SHA-1 using the ARMv8 Crypto Extensions.
 393  *
 394  * This does 4 rounds at a time, and it works very similarly to the x86 SHA
 395  * Extensions implementation.  The differences are fairly minor:
 396  *
 397  * - x86 has SHA1RNDS4 that takes an immediate that identifies the set of 20
 398  *   rounds, and it handles adding the round constants.  ARM has SHA1C for
 399  *   rounds 0-19, SHA1P for rounds 20-39 and 60-79, and SHA1M for rounds 40-59.
 400  *   These don't add the round constants, so that must be done separately.
 401  *
 402  * - ARM needs only two instructions, instead of x86's three, to prepare each
 403  *   set of 4 message schedule words: SHA1SU0 which does w[i-16] ^ w[i-14] ^
 404  *   w[i-8], and SHA1SU1 which XOR's in w[i-3] and rotates left by 1.
 405  */
 406 #if defined(__aarch64__) && \
 407         (defined(__clang__) || (defined(__GNUC__) && __GNUC__ >= 5))
 408
 409 /*
 410  * With clang 15.0 and earlier, arm_neon.h has a bug where it only defines the
 411  * SHA-1 intrinsics when SHA-2 is enabled in the main target.  This prevents
 412  * them from being used in target attribute functions.  Work around this by
 413  * defining __ARM_FEATURE_SHA2.
 414  *
 415  * And yes, the feature it wants is indeed *SHA-2*, not SHA-1.
 416  */
 417 #if defined(__clang__) && __clang_major__ <= 15 && !defined(__ARM_FEATURE_SHA2)
 418 #  define __ARM_FEATURE_SHA2 1
 419 #  define USED_SHA2_FEATURE_WORKAROUND
 420 #endif
 421 #include <arm_neon.h>
 422 #ifdef USED_SHA2_FEATURE_WORKAROUND
 423 #  undef __ARM_FEATURE_SHA2
 424 #endif
 425
 426 /* Expands to a vector containing 4 copies of the given round's constant */
 427 #define SHA1_CE_K(i)            \
 428         ((i) < 20 ? k0 :        \
 429          (i) < 40 ? k1 :        \
 430          (i) < 60 ? k2 :        \
 431                     k3)
 432
 433 /* Expands to the appropriate instruction for the given round */
 434 #define SHA1_CE_OP(i, abcd, e, w)                       \
 435         ((i) < 20 ? vsha1cq_u32((abcd), (e), (w)) :     \
 436          (i) < 40 ? vsha1pq_u32((abcd), (e), (w)) :     \
 437          (i) < 60 ? vsha1mq_u32((abcd), (e), (w)) :     \
 438                     vsha1pq_u32((abcd), (e), (w)))
 439
 /*
  * Four rounds starting at round 'i', using 'tmp' and 'abcd' from the
  * expansion site.
  *
  * - tmp receives the four schedule words in w0 plus the round constant.
  * - e1 is computed with vsha1h_u32 from lane 0 of abcd *before* abcd is
  *   overwritten by the 4-round operation, which consumes e0.
  * - The two guarded statements advance the message schedule computation
  *   (SHA1SU1 on w1, SHA1SU0 on w2); their round ranges are staggered by 4.
  */
 440 #define SHA1_CE_4ROUNDS(i, w0, w1, w2, w3, e0, e1)      \
 441         tmp = w0 + SHA1_CE_K(i);                        \
 442         e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));       \
 443         abcd = SHA1_CE_OP((i), abcd, e0, tmp);          \
 444         if ((i) >= 12 && (i) < 76)                      \
 445                 w1 = vsha1su1q_u32(w1, w0);             \
 446         if ((i) >= 8 && (i) < 72)                       \
 447                 w2 = vsha1su0q_u32(w2, w3, w0);
 448         /*
 449          * implicit: the new (w0, w1, w2, w3) is the old (w1, w2, w3, w0),
 450          * and the new (e0, e1) is the old (e1, e0)
 451          */
 452
 /*
  * Sixteen rounds as four SHA1_CE_4ROUNDS.  The schedule vectors w0..w3 are
  * passed rotated by one position each time and e0/e1 are swapped each time,
  * so after four expansions the names are back in their original roles.
  */
 453 #define SHA1_CE_16ROUNDS(i)                                     \
 454         SHA1_CE_4ROUNDS((i) +  0, w0, w1, w2, w3, e0, e1);      \
 455         SHA1_CE_4ROUNDS((i) +  4, w1, w2, w3, w0, e1, e0);      \
 456         SHA1_CE_4ROUNDS((i) +  8, w2, w3, w0, w1, e0, e1);      \
 457         SHA1_CE_4ROUNDS((i) + 12, w3, w0, w1, w2, e1, e0);
 458
 /*
  * ARMv8 Crypto Extensions variant.  Updates the state @h in place with
  * @num_blocks blocks read from @data; the do/while loop always processes at
  * least one block, so @num_blocks is expected to be nonzero.
  *
  * h[0..3] are held in the vector h_abcd for the whole call and stored back
  * after the last block, whereas h[4] is read from and updated in memory once
  * per block.  HAVE_SHA1_BLOCKS_ARM_CE signals to the dispatcher that this
  * function exists.
  */
 459 #define HAVE_SHA1_BLOCKS_ARM_CE
 460 static void
 461 #ifdef __clang__
 462         /*
 463          * clang has the SHA-1 instructions under "sha2".  "crypto" used to work
 464          * too, but only in clang 15 and earlier.  So, use "sha2" here.
 465          */
 466         __attribute__((target("sha2")))
 467 #else
 468         /* gcc wants "+crypto".  "+sha2" doesn't work. */
 469         __attribute__((target("+crypto")))
 470 #endif
 471 sha1_blocks_arm_ce(u32 h[5], const void *data, size_t num_blocks)
 472 {
 473         uint32x4_t h_abcd = vld1q_u32(h);
      /* One vector of four identical lanes per round constant */
 474         uint32x4_t k0 = vdupq_n_u32(SHA1_K(0));
 475         uint32x4_t k1 = vdupq_n_u32(SHA1_K(20));
 476         uint32x4_t k2 = vdupq_n_u32(SHA1_K(40));
 477         uint32x4_t k3 = vdupq_n_u32(SHA1_K(60));
 478
 479         do {
 480                 uint32x4_t abcd = h_abcd;
 481                 u32 e0 = h[4], e1;
 482                 uint32x4_t tmp, w0, w1, w2, w3;
 483
      /* Load the 64-byte block, reversing the bytes of each 32-bit word. */
 484                 w0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(data + 0)));
 485                 w1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(data + 16)));
 486                 w2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(data + 32)));
 487                 w3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(data + 48)));
 488
      /* 5 x 16 = 80 rounds for this block */
 489                 SHA1_CE_16ROUNDS(0);
 490                 SHA1_CE_16ROUNDS(16);
 491                 SHA1_CE_16ROUNDS(32);
 492                 SHA1_CE_16ROUNDS(48);
 493                 SHA1_CE_16ROUNDS(64);
 494
      /* Fold this block's result into the running state. */
 495                 h_abcd += abcd;
 496                 h[4] += e0;
 497                 data += SHA1_BLOCK_SIZE;
 498         } while (--num_blocks);
 499
 500         vst1q_u32(h, h_abcd);
 501 }
 502 #endif /* ARMv8 Crypto Extensions implementation */
 503
 504 /*----------------------------------------------------------------------------*
 505  *                              Everything else                               *
 506  *----------------------------------------------------------------------------*/
 507
 508 static void
 509 sha1_blocks(u32 h[5], const void *data, size_t num_blocks)
 510 {
 511 #ifdef HAVE_SHA1_BLOCKS_X86_SHA
 512         if ((cpu_features & (X86_CPU_FEATURE_SHA | X86_CPU_FEATURE_SSE4_1)) ==
 513             (X86_CPU_FEATURE_SHA | X86_CPU_FEATURE_SSE4_1))
 514                 return sha1_blocks_x86_sha(h, data, num_blocks);
 515 #endif
 516 #ifdef HAVE_SHA1_BLOCKS_X86_AVX_BMI2
 517         if ((cpu_features & (X86_CPU_FEATURE_AVX | X86_CPU_FEATURE_BMI2)) ==
 518             (X86_CPU_FEATURE_AVX | X86_CPU_FEATURE_BMI2))
 519                 return sha1_blocks_x86_avx_bmi2(h, data, num_blocks);
 520 #endif
 521 #ifdef HAVE_SHA1_BLOCKS_X86_SSSE3
 522         if (cpu_features & X86_CPU_FEATURE_SSSE3)
 523                 return sha1_blocks_x86_ssse3(h, data, num_blocks);
 524 #endif
 525 #ifdef HAVE_SHA1_BLOCKS_ARM_CE
 526         if (cpu_features & ARM_CPU_FEATURE_SHA1)
 527                 return sha1_blocks_arm_ce(h, data, num_blocks);
 528 #endif
 529         return sha1_blocks_generic(h, data, num_blocks);
 530 }
 531
 532 /*
 533  * Initialize the given SHA-1 context.
 534  *
 535  * After sha1_init(), call sha1_update() zero or more times to provide the data
 536  * to be hashed.  Then call sha1_final() to get the resulting message digest.
 537  */
 538 void
 539 sha1_init(struct sha1_ctx *ctx)
 540 {
 541         ctx->bytecount = 0;
 542
 543         ctx->h[0] = 0x67452301;
 544         ctx->h[1] = 0xEFCDAB89;
 545         ctx->h[2] = 0x98BADCFE;
 546         ctx->h[3] = 0x10325476;
 547         ctx->h[4] = 0xC3D2E1F0;
 548 }
 549
 550 /* Update the SHA-1 context with @len bytes of data. */
 551 void
 552 sha1_update(struct sha1_ctx *ctx, const void *data, size_t len)
 553 {
 554         unsigned buffered = ctx->bytecount % SHA1_BLOCK_SIZE;
 555         size_t blocks;
 556
 557         ctx->bytecount += len;
 558
 559         if (buffered) {
 560                 unsigned remaining = SHA1_BLOCK_SIZE - buffered;
 561
 562                 if (len < remaining) {
 563                         memcpy(&ctx->buffer[buffered], data, len);
 564                         return;
 565                 }
 566                 memcpy(&ctx->buffer[buffered], data, remaining);
 567                 sha1_blocks(ctx->h, ctx->buffer, 1);
 568                 data += remaining;
 569                 len -= remaining;
 570         }
 571
 572         blocks = len / SHA1_BLOCK_SIZE;
 573         if (blocks) {
 574                 sha1_blocks(ctx->h, data, blocks);
 575                 data += blocks * SHA1_BLOCK_SIZE;
 576                 len -= blocks * SHA1_BLOCK_SIZE;
 577         }
 578
 579         if (len)
 580                 memcpy(ctx->buffer, data, len);
 581 }
 582
 583 /* Finalize the SHA-1 operation and return the resulting message digest. */
 584 void
 585 sha1_final(struct sha1_ctx *ctx, u8 hash[SHA1_HASH_SIZE])
 586 {
 587         unsigned buffered = ctx->bytecount % SHA1_BLOCK_SIZE;
 588         const be64 bitcount = cpu_to_be64(ctx->bytecount * 8);
 589
 590         ctx->buffer[buffered++] = 0x80;
 591         if (buffered > SHA1_BLOCK_SIZE - 8) {
 592                 memset(&ctx->buffer[buffered], 0, SHA1_BLOCK_SIZE - buffered);
 593                 sha1_blocks(ctx->h, ctx->buffer, 1);
 594                 buffered = 0;
 595         }
 596         memset(&ctx->buffer[buffered], 0, SHA1_BLOCK_SIZE - 8 - buffered);
 597         memcpy(&ctx->buffer[SHA1_BLOCK_SIZE - 8], &bitcount, 8);
 598         sha1_blocks(ctx->h, ctx->buffer, 1);
 599
 600         put_unaligned_be32(ctx->h[0], &hash[0]);
 601         put_unaligned_be32(ctx->h[1], &hash[4]);
 602         put_unaligned_be32(ctx->h[2], &hash[8]);
 603         put_unaligned_be32(ctx->h[3], &hash[12]);
 604         put_unaligned_be32(ctx->h[4], &hash[16]);
 605 }
 606
 607 /* Calculate the SHA-1 message digest of the given data. */
 608 void
 609 sha1(const void *data, size_t len, u8 hash[SHA1_HASH_SIZE])
 610 {
 611         struct sha1_ctx ctx;
 612
 613         sha1_init(&ctx);
 614         sha1_update(&ctx, data, len);
 615         sha1_final(&ctx, hash);
 616 }
 617
 618 /* "Null" SHA-1 message digest containing all 0's */
 619 const u8 zero_hash[SHA1_HASH_SIZE];
 620
 621 /* Build a hexadecimal string representation of a SHA-1 message digest. */
 622 void
 623 sprint_hash(const u8 hash[SHA1_HASH_SIZE], tchar strbuf[SHA1_HASH_STRING_LEN])
 624 {
 625         int i;
 626         u8 high, low;
 627
 628         for (i = 0; i < SHA1_HASH_SIZE; i++) {
 629                 high = hash[i] >> 4;
 630                 low = hash[i] & 0xF;
 631                 strbuf[i * 2 + 0] = (high < 10 ? high + '0' : high - 10 + 'a');
 632                 strbuf[i * 2 + 1] = (low  < 10 ? low  + '0' : low  - 10 + 'a');
 633         }
 634         strbuf[i * 2] = 0;
 635 }