From f73332717baf4ab155b57f64ec9dbcd344d7f0c1 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 18 Mar 2023 00:17:54 -0700 Subject: [PATCH] SHA-1 rework First, add new SHA-1 implementations that use the x86 and ARM SHA-1 intrinsics, so that these can be taken advantage of in Windows builds. Second, replace sha1-ssse3.asm with an easier-to-maintain implementation using intrinsics, and build a copy of it with AVX+BMI2 enabled. Finally, now that better SHA-1 implementations are included, support for OpenSSL's SHA-1 is no longer very useful, so remove it. I considered going in the other direction: removing all SHA-1 code and relying completely on external libraries. Some issues with that are: - Statically linking OpenSSL into libwim.dll on Windows increases the binary size by over 4x, even when using "no-autoalginit". - OpenSSL has deprecated its easy-to-use SHA-1 API in favor of the EVP API, which is harder to use (everything can fail) and slower. - Windows CryptoAPI is Windows-only (duh) and also has a complex, slow API where every function can fail, so that's not great either. - SHA-1 is considered insecure these days, so it may be unwise to count on its continued support in crypto libraries into the future. So, let's just do it ourselves... 
--- Makefile.am | 21 +- README | 16 - README.WINDOWS | 1 - build-aux/nasm_lt.sh | 57 ---- configure.ac | 47 --- include/wimlib/sha1.h | 59 ++-- m4/nasm.m4 | 212 ------------ src/integrity.c | 4 +- src/resource.c | 4 +- src/sha1-ssse3.asm | 571 ------------------------------- src/sha1.c | 672 +++++++++++++++++++++++++++++-------- tools/make-windows-release | 7 +- 12 files changed, 558 insertions(+), 1113 deletions(-) delete mode 100755 build-aux/nasm_lt.sh delete mode 100644 m4/nasm.m4 delete mode 100644 src/sha1-ssse3.asm diff --git a/Makefile.am b/Makefile.am index fd2ec083..3579651a 100644 --- a/Makefile.am +++ b/Makefile.am @@ -191,12 +191,7 @@ libwim_la_CFLAGS = \ $(PTHREAD_CFLAGS) \ $(LIBXML2_CFLAGS) \ $(LIBNTFS_3G_CFLAGS) \ - $(LIBFUSE_CFLAGS) \ - $(LIBCRYPTO_CFLAGS) - -# This is to avoid deprecation warnings for OpenSSL's SHA1_* functions. -# Their replacement (EVP) is harder to use and less performant... -libwim_la_CFLAGS += -Wno-deprecated-declarations + $(LIBFUSE_CFLAGS) libwim_la_LDFLAGS = $(AM_LDFLAGS) -version-info 36:0:21 @@ -206,22 +201,8 @@ libwim_la_LIBADD = \ $(LIBNTFS_3G_LIBS) \ $(LIBFUSE_LIBS) \ $(LIBRT_LIBS) \ - $(LIBCRYPTO_LIBS) \ $(PLATFORM_LIBS) -if ENABLE_SSSE3_SHA1 -libwim_la_SOURCES += src/sha1-ssse3.asm -libwim_la_LIBADD += src/sha1-ssse3.lo - -src/sha1-ssse3.lo:src/sha1-ssse3.asm - $(LIBTOOL) --mode=compile $(srcdir)/build-aux/nasm_lt.sh \ - $(NASM) $(NAFLAGS) $(NASM_PLATFORM_FLAGS) \ - -DINTEL_SHA1_UPDATE_FUNCNAME=$(NASM_SYMBOL_PREFIX)sha1_transform_blocks_ssse3 \ - -DINTEL_SHA1_UPDATE_DEFAULT_DISPATCH=$(NASM_SYMBOL_PREFIX)sha1_transform_blocks_default \ - $< -o $@ -endif -EXTRA_DIST += build-aux/nasm_lt.sh - ############################################################################## # Programs # ############################################################################## diff --git a/README b/README index 275e4a78..1952ff24 100644 --- a/README +++ b/README @@ -142,11 +142,6 @@ dependencies were already included and this section 
is irrelevant. headers for libntfs-3g to be installed. The minimum required version is 2011-4-12, but newer versions contain important bug fixes. -* OpenSSL / libcrypto (optional) - wimlib can use the SHA-1 message digest implementation from libcrypto - (usually provided by OpenSSL) instead of compiling in yet another SHA-1 - implementation. - * cdrkit (optional) * mtools (optional) * syslinux (optional) @@ -181,12 +176,6 @@ This section documents the most important options that may be passed to the The default is --with-fuse when building for Linux, and --without-fuse otherwise. ---without-libcrypto - Build in functions for SHA-1 rather than using external SHA-1 functions - from libcrypto (usually provided by OpenSSL). - - The default is to use libcrypto if it is found on your system. - PORTABILITY wimlib works on both UNIX-like systems (Linux, Mac OS X, FreeBSD, etc.) and @@ -272,11 +261,6 @@ posted to https://wimlib.net/forums/. wimlib is independently developed and does not contain any code, data, or files copyrighted by Microsoft. It is not known to be affected by any patents. -On UNIX-like systems, if you do not want wimlib to be dynamically linked with -libcrypto (OpenSSL), configure with --without-libcrypto. This replaces the SHA1 -implementation with built-in code and there will be no difference in -functionality. - Note: copyright years may be listed using range notation, e.g., 2012-2016, indicating that every year in the range, inclusive, is a copyrightable year that would otherwise be listed individually. 
diff --git a/README.WINDOWS b/README.WINDOWS index ea4eff2f..5a9bd3b1 100644 --- a/README.WINDOWS +++ b/README.WINDOWS @@ -152,7 +152,6 @@ install the following additional Cygwin packages: - automake (category "Devel") - git (category "Devel") - libtool (category "Devel") - - nasm (category "Devel") - ghostscript (category "Graphics") - wget (category "Web") diff --git a/build-aux/nasm_lt.sh b/build-aux/nasm_lt.sh deleted file mode 100755 index 6cd73294..00000000 --- a/build-aux/nasm_lt.sh +++ /dev/null @@ -1,57 +0,0 @@ -#! /bin/sh -command="" -infile="" -o_opt=no -pic=no -while [ $# -gt 0 ]; do - case "$1" in - -DPIC|-fPIC|-fpic|-Kpic|-KPIC) - if [ "$pic" != "yes" ] ; then - command="$command -DPIC" - pic=yes - fi - ;; - -f|-fbin|-faout|-faoutb|-fcoff|-felf|-felf64|-fas86| \ - -fobj|-fwin32|-fwin64|-frdf|-fieee|-fmacho|-fmacho64) - # it's a file format specifier for nasm. - command="$command $1" - ;; - -f*) - # maybe a code-generation flag for gcc. - ;; - -[Ii]*) - incdir=`echo "$1" | sed 's/^-[Ii]//'` - if [ "x$incdir" = x -a "x$2" != x ] ; then - case "$2" in - -*) ;; - *) incdir="$2"; shift;; - esac - fi - if [ "x$incdir" != x ] ; then - # In the case of NASM, the trailing slash is necessary. - incdir=`echo "$incdir" | sed 's%/*$%/%'` - command="$command -I$incdir" - fi - ;; - -o*) - o_opt=yes - command="$command $1" - ;; - *.asm) - infile=$1 - command="$command $1" - ;; - *) - command="$command $1" - ;; - esac - shift -done -if [ "$o_opt" != yes ] ; then - # By default, NASM creates an output file - # in the same directory as the input file. 
- outfile="-o `echo $infile | sed -e 's%^.*/%%' -e 's%\.[^.]*$%%'`.o" - command="$command $outfile" -fi -echo $command -exec $command diff --git a/configure.ac b/configure.ac index 81e756d6..8c77b2eb 100644 --- a/configure.ac +++ b/configure.ac @@ -175,53 +175,6 @@ if test "$WITH_FUSE" = "yes"; then fi AM_CONDITIONAL([WITH_FUSE], [test "$WITH_FUSE" = "yes"]) -# ------------------------ SHA-1 implementation --------------------------------- - -AC_MSG_CHECKING([whether to use SSSE3-accelerated SHA-1]) -AC_ARG_ENABLE([ssse3-sha1], - [AS_HELP_STRING([--enable-ssse3-sha1], - [Include SSSE3-accelerated SHA-1 implementation by - Intel. This implies --without-libcrypto.])], - [ENABLE_SSSE3_SHA1=$enableval], - [ENABLE_SSSE3_SHA1=no]) -AC_MSG_RESULT([$ENABLE_SSSE3_SHA1]) - -if test "$ENABLE_SSSE3_SHA1" = "yes" ; then - AC_DEFINE([ENABLE_SSSE3_SHA1], [1], - [Define to 1 if using SSSE3 implementation of SHA-1]) - AC_PROG_NASM - NASM_SYMBOL_PREFIX="" - NASM_PLATFORM_FLAGS="" - if test "$WINDOWS_NATIVE_BUILD" = "yes"; then - NASM_PLATFORM_FLAGS="-DWIN_ABI" - fi - case "$host_os" in - darwin* | rhapsody* | nextstep* | openstep* | macos*) - NASM_SYMBOL_PREFIX="_" - ;; - esac - AC_SUBST([NASM_PLATFORM_FLAGS], [$NASM_PLATFORM_FLAGS]) - AC_SUBST([NASM_SYMBOL_PREFIX], [$NASM_SYMBOL_PREFIX]) -else - AC_MSG_CHECKING([whether to use SHA-1 implementation from system libcrypto]) - AC_ARG_WITH([libcrypto], - [AS_HELP_STRING([--without-libcrypto], - [build in the SHA-1 algorithm, rather than - use external libcrypto from OpenSSL - (default is autodetect)])], - [WITH_LIBCRYPTO=$withval], - [WITH_LIBCRYPTO=auto]) - AC_MSG_RESULT([$WITH_LIBCRYPTO]) - if test "$WITH_LIBCRYPTO" != "no"; then - PKG_CHECK_MODULES([LIBCRYPTO], [libcrypto], [ - PKGCONFIG_PRIVATE_REQUIRES="$PKGCONFIG_PRIVATE_REQUIRES libcrypto" - AC_DEFINE([WITH_LIBCRYPTO], [1], - [Define to 1 if using libcrypto SHA-1]) - ], [AC_MSG_WARN([Cannot find libcrypto: using stand-alone SHA-1 code instead])]) - fi -fi 
-AM_CONDITIONAL([ENABLE_SSSE3_SHA1], [test "$ENABLE_SSSE3_SHA1" = "yes"]) - # ----------------------------- Other options --------------------------------- AC_ARG_WITH(pkgconfigdir, diff --git a/include/wimlib/sha1.h b/include/wimlib/sha1.h index e1def3d9..e1b7c778 100644 --- a/include/wimlib/sha1.h +++ b/include/wimlib/sha1.h @@ -33,7 +33,26 @@ #include "wimlib/types.h" #include "wimlib/util.h" -#define SHA1_HASH_SIZE 20 +#define SHA1_HASH_SIZE 20 +#define SHA1_BLOCK_SIZE 64 + +struct sha1_ctx { + u64 bytecount; + u32 h[5]; + u8 buffer[SHA1_BLOCK_SIZE]; +}; + +extern void +sha1_init(struct sha1_ctx *ctx); + +extern void +sha1_update(struct sha1_ctx *ctx, const void *data, size_t len); + +extern void +sha1_final(struct sha1_ctx *ctx, u8 hash[SHA1_HASH_SIZE]); + +extern void +sha1(const void *data, size_t len, u8 hash[SHA1_HASH_SIZE]); extern const u8 zero_hash[SHA1_HASH_SIZE]; @@ -62,43 +81,7 @@ hashes_equal(const u8 h1[SHA1_HASH_SIZE], const u8 h2[SHA1_HASH_SIZE]) static inline bool is_zero_hash(const u8 *hash) { - return (hash == zero_hash || hashes_equal(hash, zero_hash)); + return hash == zero_hash || hashes_equal(hash, zero_hash); } -#ifdef WITH_LIBCRYPTO - -#include - -#define sha1_init SHA1_Init -#define sha1_update SHA1_Update -#define sha1_final SHA1_Final - -static inline void -sha1(const void *data, size_t len, u8 hash[SHA1_HASH_SIZE]) -{ - SHA1(data, len, hash); -} - -#else /* WITH_LIBCRYPTO */ - -typedef struct { - u64 bytecount; - u32 state[5]; - u8 buffer[64]; -} SHA_CTX; - -extern void -sha1_init(SHA_CTX *ctx); - -extern void -sha1_update(SHA_CTX *ctx, const void *data, size_t len); - -extern void -sha1_final(u8 hash[SHA1_HASH_SIZE], SHA_CTX *ctx); - -extern void -sha1(const void *data, size_t len, u8 hash[SHA1_HASH_SIZE]); - -#endif /* !WITH_LIBCRYPTO */ - #endif /* _WIMLIB_SHA1_H */ diff --git a/m4/nasm.m4 b/m4/nasm.m4 deleted file mode 100644 index dea8669b..00000000 --- a/m4/nasm.m4 +++ /dev/null @@ -1,212 +0,0 @@ -# AC_PROG_NASM -# 
-------------------------- -# Check that NASM exists and determine flags -AC_DEFUN([AC_PROG_NASM],[ - -AC_CHECK_PROGS(NASM, [nasm nasmw yasm]) -test -z "$NASM" && AC_MSG_ERROR([no nasm (Netwide Assembler) found]) - -AC_MSG_CHECKING([for object file format of host system]) -case "$host_os" in - cygwin* | mingw* | pw32* | interix*) - case "$host_cpu" in - x86_64) - objfmt='Win64-COFF' - ;; - *) - objfmt='Win32-COFF' - ;; - esac - ;; - msdosdjgpp* | go32*) - objfmt='COFF' - ;; - os2-emx*) # not tested - objfmt='MSOMF' # obj - ;; - linux*coff* | linux*oldld*) - objfmt='COFF' # ??? - ;; - linux*aout*) - objfmt='a.out' - ;; - linux*) - case "$host_cpu" in - x86_64) - objfmt='ELF64' - ;; - *) - objfmt='ELF' - ;; - esac - ;; - kfreebsd* | freebsd* | netbsd* | openbsd*) - if echo __ELF__ | $CC -E - | grep __ELF__ > /dev/null; then - objfmt='BSD-a.out' - else - case "$host_cpu" in - x86_64 | amd64) - objfmt='ELF64' - ;; - *) - objfmt='ELF' - ;; - esac - fi - ;; - solaris* | sunos* | sysv* | sco*) - case "$host_cpu" in - x86_64) - objfmt='ELF64' - ;; - *) - objfmt='ELF' - ;; - esac - ;; - darwin* | rhapsody* | nextstep* | openstep* | macos*) - case "$host_cpu" in - x86_64) - objfmt='Mach-O64' - ;; - *) - objfmt='Mach-O' - ;; - esac - ;; - *) - objfmt='ELF ?' - ;; -esac - -AC_MSG_RESULT([$objfmt]) -if test "$objfmt" = 'ELF ?'; then - objfmt='ELF' - AC_MSG_WARN([unexpected host system. 
assumed that the format is $objfmt.]) -fi - -AC_MSG_CHECKING([for object file format specifier (NAFLAGS) ]) -case "$objfmt" in - MSOMF) NAFLAGS='-fobj -DOBJ32';; - Win32-COFF) NAFLAGS='-fwin32 -DWIN32';; - Win64-COFF) NAFLAGS='-fwin64 -DWIN64 -D__x86_64__';; - COFF) NAFLAGS='-fcoff -DCOFF';; - a.out) NAFLAGS='-faout -DAOUT';; - BSD-a.out) NAFLAGS='-faoutb -DAOUT';; - ELF) NAFLAGS='-felf -DELF';; - ELF64) NAFLAGS='-felf64 -DELF -D__x86_64__';; - RDF) NAFLAGS='-frdf -DRDF';; - Mach-O) NAFLAGS='-fmacho -DMACHO';; - Mach-O64) NAFLAGS='-fmacho64 -DMACHO -D__x86_64__';; -esac -AC_MSG_RESULT([$NAFLAGS]) -AC_SUBST([NAFLAGS]) - -AC_MSG_CHECKING([whether the assembler ($NASM $NAFLAGS) works]) -cat > conftest.asm <&AS_MESSAGE_LOG_FD - cat conftest.asm >&AS_MESSAGE_LOG_FD - rm -rf conftest* - AC_MSG_RESULT(no) - AC_MSG_ERROR([installation or configuration problem: assembler cannot create object files.]) -fi - -AC_MSG_CHECKING([whether the linker accepts assembler output]) -try_nasm='${CC-cc} -o conftest${ac_exeext} $LDFLAGS conftest.o $LIBS 1>&AS_MESSAGE_LOG_FD' -if AC_TRY_EVAL(try_nasm) && test -s conftest${ac_exeext}; then - rm -rf conftest* - AC_MSG_RESULT(yes) -else - rm -rf conftest* - AC_MSG_RESULT(no) - AC_MSG_ERROR([configuration problem: maybe object file format mismatch.]) -fi - -]) - -# AC_CHECK_COMPATIBLE_ARM_ASSEMBLER_IFELSE -# -------------------------- -# Test whether the assembler is suitable and supports NEON instructions -AC_DEFUN([AC_CHECK_COMPATIBLE_ARM_ASSEMBLER_IFELSE],[ - ac_good_gnu_arm_assembler=no - ac_save_CC="$CC" - ac_save_CFLAGS="$CFLAGS" - CFLAGS="$CCASFLAGS -x assembler-with-cpp" - CC="$CCAS" - AC_COMPILE_IFELSE([AC_LANG_SOURCE([[ - .text - .fpu neon - .arch armv7a - .object_arch armv4 - .arm - pld [r0] - vmovn.u16 d0, q0]])], ac_good_gnu_arm_assembler=yes) - - ac_use_gas_preprocessor=no - if test "x$ac_good_gnu_arm_assembler" = "xno" ; then - CC="gas-preprocessor.pl $CCAS" - AC_COMPILE_IFELSE([AC_LANG_SOURCE([[ - .text - .fpu neon - .arch 
armv7a - .object_arch armv4 - .arm - pld [r0] - vmovn.u16 d0, q0]])], ac_use_gas_preprocessor=yes) - fi - CFLAGS="$ac_save_CFLAGS" - CC="$ac_save_CC" - - if test "x$ac_use_gas_preprocessor" = "xyes" ; then - CCAS="gas-preprocessor.pl $CCAS" - AC_SUBST([CCAS]) - ac_good_gnu_arm_assembler=yes - fi - - if test "x$ac_good_gnu_arm_assembler" = "xyes" ; then - $1 - else - $2 - fi -]) - -# AC_CHECK_COMPATIBLE_MIPSEL_ASSEMBLER_IFELSE -# -------------------------- -# Test whether the assembler is suitable and supports MIPS instructions -AC_DEFUN([AC_CHECK_COMPATIBLE_MIPSEL_ASSEMBLER_IFELSE],[ - have_mips_dspr2=no - ac_save_CFLAGS="$CFLAGS" - CFLAGS="$CCASFLAGS -mdspr2" - - AC_COMPILE_IFELSE([AC_LANG_SOURCE([[ - - int main () - { - int c = 0, a = 0, b = 0; - __asm__ __volatile__ ( - "precr.qb.ph %[c], %[a], %[b] \n\t" - : [c] "=r" (c) - : [a] "r" (a), [b] "r" (b) - ); - return c; - } - ]])], have_mips_dspr2=yes) - CFLAGS=$ac_save_CFLAGS - - if test "x$have_mips_dspr2" = "xyes" ; then - $1 - else - $2 - fi -]) diff --git a/src/integrity.c b/src/integrity.c index 7494c324..e0d4447c 100644 --- a/src/integrity.c +++ b/src/integrity.c @@ -59,7 +59,7 @@ calculate_chunk_sha1(struct filedes *in_fd, size_t this_chunk_size, off_t offset, u8 sha1_md[]) { u8 buf[BUFFER_SIZE]; - SHA_CTX ctx; + struct sha1_ctx ctx; size_t bytes_remaining; size_t bytes_to_read; int ret; @@ -78,7 +78,7 @@ calculate_chunk_sha1(struct filedes *in_fd, size_t this_chunk_size, bytes_remaining -= bytes_to_read; offset += bytes_to_read; } while (bytes_remaining); - sha1_final(sha1_md, &ctx); + sha1_final(&ctx, sha1_md); return 0; } diff --git a/src/resource.c b/src/resource.c index afba8774..a643ce03 100644 --- a/src/resource.c +++ b/src/resource.c @@ -976,7 +976,7 @@ blobifier_cb(const void *chunk, size_t size, void *_ctx) } struct hasher_context { - SHA_CTX sha_ctx; + struct sha1_ctx sha_ctx; int flags; struct read_blob_callbacks cbs; }; @@ -1089,7 +1089,7 @@ hasher_end_blob(struct blob_descriptor *blob, int 
status, void *_ctx) } /* Retrieve the final SHA-1 message digest. */ - sha1_final(hash, &ctx->sha_ctx); + sha1_final(&ctx->sha_ctx, hash); /* Set the SHA-1 message digest of the blob, or compare the calculated * value with stored value. */ diff --git a/src/sha1-ssse3.asm b/src/sha1-ssse3.asm deleted file mode 100644 index c3b07abb..00000000 --- a/src/sha1-ssse3.asm +++ /dev/null @@ -1,571 +0,0 @@ -;--------------------- -; -; This code implements two interfaces of SHA-1 update function: 1) working on a single -; 64-byte block and 2) working on a buffer of multiple 64-bit blocks. Multiple blocks -; version of code is software pipelined and faster overall, it is a default. Assemble -; with -DINTEL_SHA1_SINGLEBLOCK to select single 64-byte block function interface. -; -; C++ prototypes of implemented functions are below: -; -; #ifndef INTEL_SHA1_SINGLEBLOCK -; // Updates 20-byte SHA-1 record in 'hash' for 'num_blocks' consecutive 64-byte blocks -; extern "C" void sha1_update_intel(int *hash, const char* input, size_t num_blocks ); -; #else -; // Updates 20-byte SHA-1 record in 'hash' for one 64-byte block pointed by 'input' -; extern "C" void sha1_update_intel(int *hash, const char* input); -; #endif -; -; Function name 'sha1_update_intel' can be changed in the source or via macro: -; -DINTEL_SHA1_UPDATE_FUNCNAME=my_sha1_update_func_name -; -; It implements both UNIX(default) and Windows ABIs, use -DWIN_ABI on Windows -; -; Code checks CPU for SSSE3 support via CPUID feature flag (CPUID.1.ECX.SSSE3[bit 9]==1), -; and performs dispatch. Since in most cases the functionality on non-SSSE3 supporting CPUs -; is also required, the default (e.g. 
one being replaced) function can be provided for -; dispatch on such CPUs, the name of old function can be changed in the source or via macro: -; -DINTEL_SHA1_UPDATE_DEFAULT_DISPATCH=default_sha1_update_function_name -; -; Authors: Maxim Locktyukhin and Ronen Zohar at Intel.com -; - -%ifndef INTEL_SHA1_UPDATE_DEFAULT_DISPATCH -;; can be replaced with a default SHA-1 update function name -%define INTEL_SHA1_UPDATE_DEFAULT_DISPATCH sha1_intel_non_ssse3_cpu_stub_ -%else -extern INTEL_SHA1_UPDATE_DEFAULT_DISPATCH -%endif - -;; provide alternative SHA-1 update function's name here -%ifndef INTEL_SHA1_UPDATE_FUNCNAME -%define INTEL_SHA1_UPDATE_FUNCNAME sha1_update_intel -%endif - -global INTEL_SHA1_UPDATE_FUNCNAME - - -%ifndef INTEL_SHA1_SINGLEBLOCK -%assign multiblock 1 -%else -%assign multiblock 0 -%endif - - -bits 64 -default rel - -%ifdef WIN_ABI -%xdefine arg1 rcx -%xdefine arg2 rdx -%xdefine arg3 r8 -%else -%xdefine arg1 rdi -%xdefine arg2 rsi -%xdefine arg3 rdx -%endif - -%xdefine ctx arg1 -%xdefine buf arg2 -%xdefine cnt arg3 - -%macro REGALLOC 0 -%xdefine A ecx -%xdefine B esi -%xdefine C edi -%xdefine D ebp -%xdefine E edx - -%xdefine T1 eax -%xdefine T2 ebx -%endmacro - -%xdefine K_BASE r8 -%xdefine HASH_PTR r9 -%xdefine BUFFER_PTR r10 -%xdefine BUFFER_END r11 - -%xdefine W_TMP xmm0 -%xdefine W_TMP2 xmm9 - -%xdefine W0 xmm1 -%xdefine W4 xmm2 -%xdefine W8 xmm3 -%xdefine W12 xmm4 -%xdefine W16 xmm5 -%xdefine W20 xmm6 -%xdefine W24 xmm7 -%xdefine W28 xmm8 - -%xdefine XMM_SHUFB_BSWAP xmm10 - -;; we keep window of 64 w[i]+K pre-calculated values in a circular buffer -%xdefine WK(t) (rsp + (t & 15)*4) - -;------------------------------------------------------------------------------ -; -; macro implements SHA-1 function's body for single or several 64-byte blocks -; first param: function's name -; second param: =0 - function implements single 64-byte block hash -; =1 - function implements multiple64-byte blocks hash -; 3rd function's argument is a number, greater 0, 
of 64-byte blocks to calc hash for -; -%macro SHA1_VECTOR_ASM 2 -align 4096 -%1: -push rbx -push rbp - -%ifdef WIN_ABI -push rdi -push rsi - -%xdefine stack_size (16*4 + 16*5 + 8) -%else -%xdefine stack_size (16*4 + 8) -%endif - -sub rsp, stack_size - -%ifdef WIN_ABI -%xdefine xmm_save_base (rsp + 16*4) - -xmm_mov [xmm_save_base + 0*16], xmm6 -xmm_mov [xmm_save_base + 1*16], xmm7 -xmm_mov [xmm_save_base + 2*16], xmm8 -xmm_mov [xmm_save_base + 3*16], xmm9 -xmm_mov [xmm_save_base + 4*16], xmm10 -%endif - -mov HASH_PTR, ctx -mov BUFFER_PTR, buf - -%if (%2 == 1) -shl cnt, 6 ;; mul by 64 -add cnt, buf -mov BUFFER_END, cnt -%endif - -lea K_BASE, [K_XMM_AR] -xmm_mov XMM_SHUFB_BSWAP, [bswap_shufb_ctl] - -SHA1_PIPELINED_MAIN_BODY %2 - -%ifdef WIN_ABI -xmm_mov xmm6, [xmm_save_base + 0*16] -xmm_mov xmm7, [xmm_save_base + 1*16] -xmm_mov xmm8, [xmm_save_base + 2*16] -xmm_mov xmm9, [xmm_save_base + 3*16] -xmm_mov xmm10,[xmm_save_base + 4*16] -%endif - -add rsp, stack_size - -%ifdef WIN_ABI -pop rsi -pop rdi -%endif - -pop rbp -pop rbx - -ret -%endmacro - -;-------------------------------------------- -; macro implements 80 rounds of SHA-1, for one 64-byte block or multiple blocks with s/w pipelining -; macro param: =0 - process single 64-byte block -; =1 - multiple blocks -; -%macro SHA1_PIPELINED_MAIN_BODY 1 - -REGALLOC - -mov A, [HASH_PTR ] -mov B, [HASH_PTR+ 4] -mov C, [HASH_PTR+ 8] -mov D, [HASH_PTR+12] - -mov E, [HASH_PTR+16] - -%assign i 0 -%rep W_PRECALC_AHEAD -W_PRECALC i -%assign i i+1 -%endrep - -%xdefine F F1 - -%if (%1 == 1) ;; code loops through more than one block -%%_loop: -cmp BUFFER_PTR, K_BASE ;; we use K_BASE value as a signal of a last block, -jne %%_begin ;; it is set below by: cmovae BUFFER_PTR, K_BASE -jmp %%_end - -align 32 -%%_begin: -%endif -RR A,B,C,D,E,0 -RR D,E,A,B,C,2 -RR B,C,D,E,A,4 -RR E,A,B,C,D,6 -RR C,D,E,A,B,8 - -RR A,B,C,D,E,10 -RR D,E,A,B,C,12 -RR B,C,D,E,A,14 -RR E,A,B,C,D,16 -RR C,D,E,A,B,18 - -%xdefine F F2 - -RR A,B,C,D,E,20 -RR 
D,E,A,B,C,22 -RR B,C,D,E,A,24 -RR E,A,B,C,D,26 -RR C,D,E,A,B,28 - -RR A,B,C,D,E,30 -RR D,E,A,B,C,32 -RR B,C,D,E,A,34 -RR E,A,B,C,D,36 -RR C,D,E,A,B,38 - -%xdefine F F3 - -RR A,B,C,D,E,40 -RR D,E,A,B,C,42 -RR B,C,D,E,A,44 -RR E,A,B,C,D,46 -RR C,D,E,A,B,48 - -RR A,B,C,D,E,50 -RR D,E,A,B,C,52 -RR B,C,D,E,A,54 -RR E,A,B,C,D,56 -RR C,D,E,A,B,58 - -%xdefine F F4 - -%if (%1 == 1) ;; if code loops through more than one block -add BUFFER_PTR, 64 ;; move to next 64-byte block -cmp BUFFER_PTR, BUFFER_END ;; check if current block is the last one -cmovae BUFFER_PTR, K_BASE ;; smart way to signal the last iteration -%else -%xdefine W_NO_TAIL_PRECALC 1 ;; no software pipelining for single block interface -%endif - -RR A,B,C,D,E,60 -RR D,E,A,B,C,62 -RR B,C,D,E,A,64 -RR E,A,B,C,D,66 -RR C,D,E,A,B,68 - -RR A,B,C,D,E,70 -RR D,E,A,B,C,72 -RR B,C,D,E,A,74 -RR E,A,B,C,D,76 -RR C,D,E,A,B,78 - -UPDATE_HASH [HASH_PTR ],A -UPDATE_HASH [HASH_PTR+ 4],B -UPDATE_HASH [HASH_PTR+ 8],C -UPDATE_HASH [HASH_PTR+12],D -UPDATE_HASH [HASH_PTR+16],E - -%if (%1 == 1) -jmp %%_loop - -align 32 -%%_end: -%endif - - -%xdefine W_NO_TAIL_PRECALC 0 -%xdefine F %error - -%endmacro - - -%macro F1 3 -mov T1,%2 -xor T1,%3 -and T1,%1 -xor T1,%3 -%endmacro - -%macro F2 3 -mov T1,%3 -xor T1,%2 -xor T1,%1 -%endmacro - -%macro F3 3 -mov T1,%2 -mov T2,%1 -or T1,%1 -and T2,%2 -and T1,%3 -or T1,T2 -%endmacro - -%define F4 F2 - -%macro UPDATE_HASH 2 -add %2, %1 -mov %1, %2 -%endmacro - - -%macro W_PRECALC 1 -%xdefine i (%1) - -%if (i < 20) -%xdefine K_XMM 0 -%elif (i < 40) -%xdefine K_XMM 16 -%elif (i < 60) -%xdefine K_XMM 32 -%else -%xdefine K_XMM 48 -%endif - -%if (i<16 || (i>=80 && i<(80 + W_PRECALC_AHEAD))) - -%if (W_NO_TAIL_PRECALC == 0) - -%xdefine i ((%1) % 80) ;; pre-compute for the next iteration - -%if (i == 0) -W_PRECALC_RESET -%endif - - -W_PRECALC_00_15 -%endif - -%elif (i < 32) -W_PRECALC_16_31 -%elif (i < 80) ;; rounds 32-79 -W_PRECALC_32_79 -%endif -%endmacro - -%macro W_PRECALC_RESET 0 -%xdefine W W0 
-%xdefine W_minus_04 W4 -%xdefine W_minus_08 W8 -%xdefine W_minus_12 W12 -%xdefine W_minus_16 W16 -%xdefine W_minus_20 W20 -%xdefine W_minus_24 W24 -%xdefine W_minus_28 W28 -%xdefine W_minus_32 W -%endmacro - -%macro W_PRECALC_ROTATE 0 -%xdefine W_minus_32 W_minus_28 -%xdefine W_minus_28 W_minus_24 -%xdefine W_minus_24 W_minus_20 -%xdefine W_minus_20 W_minus_16 -%xdefine W_minus_16 W_minus_12 -%xdefine W_minus_12 W_minus_08 -%xdefine W_minus_08 W_minus_04 -%xdefine W_minus_04 W -%xdefine W W_minus_32 -%endmacro - -%xdefine W_PRECALC_AHEAD 16 -%xdefine W_NO_TAIL_PRECALC 0 - - -%xdefine xmm_mov movdqa - -%macro W_PRECALC_00_15 0 -;; message scheduling pre-compute for rounds 0-15 -%if ((i & 3) == 0) ;; blended SSE and ALU instruction scheduling, 1 vector iteration per 4 rounds -movdqu W_TMP, [BUFFER_PTR + (i * 4)] -%elif ((i & 3) == 1) -pshufb W_TMP, XMM_SHUFB_BSWAP -movdqa W, W_TMP -%elif ((i & 3) == 2) -paddd W_TMP, [K_BASE] -%elif ((i & 3) == 3) -movdqa [WK(i&~3)], W_TMP - -W_PRECALC_ROTATE -%endif -%endmacro - -%macro W_PRECALC_16_31 0 -;; message scheduling pre-compute for rounds 16-31 -;; calculating last 32 w[i] values in 8 XMM registers -;; pre-calculate K+w[i] values and store to mem, for later load by ALU add instruction -;; -;; "brute force" vectorization for rounds 16-31 only due to w[i]->w[i-3] dependency -;; -%if ((i & 3) == 0) ;; blended SSE and ALU instruction scheduling, 1 vector iteration per 4 rounds -movdqa W, W_minus_12 -palignr W, W_minus_16, 8 ;; w[i-14] -movdqa W_TMP, W_minus_04 -psrldq W_TMP, 4 ;; w[i-3] -pxor W, W_minus_08 -%elif ((i & 3) == 1) -pxor W_TMP, W_minus_16 -pxor W, W_TMP -movdqa W_TMP2, W -movdqa W_TMP, W -pslldq W_TMP2, 12 -%elif ((i & 3) == 2) -psrld W, 31 -pslld W_TMP, 1 -por W_TMP, W -movdqa W, W_TMP2 -psrld W_TMP2, 30 -pslld W, 2 -%elif ((i & 3) == 3) -pxor W_TMP, W -pxor W_TMP, W_TMP2 -movdqa W, W_TMP -paddd W_TMP, [K_BASE + K_XMM] -movdqa [WK(i&~3)],W_TMP - -W_PRECALC_ROTATE -%endif -%endmacro - -%macro W_PRECALC_32_79 0 
-;; in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1 -;; instead we do equal: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 -;; allows more efficient vectorization since w[i]=>w[i-3] dependency is broken -;; -%if ((i & 3) == 0) ;; blended SSE and ALU instruction scheduling, 1 vector iteration per 4 rounds -movdqa W_TMP, W_minus_04 -pxor W, W_minus_28 ;; W is W_minus_32 before xor -palignr W_TMP, W_minus_08, 8 -%elif ((i & 3) == 1) -pxor W, W_minus_16 -pxor W, W_TMP -movdqa W_TMP, W -%elif ((i & 3) == 2) -psrld W, 30 -pslld W_TMP, 2 -por W_TMP, W -%elif ((i & 3) == 3) -movdqa W, W_TMP -paddd W_TMP, [K_BASE + K_XMM] -movdqa [WK(i&~3)],W_TMP - -W_PRECALC_ROTATE -%endif -%endmacro - -%macro RR 6 ;; RR does two rounds of SHA-1 back to back with W pre-calculation - -;; TEMP = A -;; A = F( i, B, C, D ) + E + ROTATE_LEFT( A, 5 ) + W[i] + K(i) -;; C = ROTATE_LEFT( B, 30 ) -;; D = C -;; E = D -;; B = TEMP - -W_PRECALC (%6 + W_PRECALC_AHEAD) -F %2, %3, %4 ;; F returns result in T1 -add %5, [WK(%6)] -rol %2, 30 -mov T2, %1 -add %4, [WK(%6 + 1)] -rol T2, 5 -add %5, T1 - -W_PRECALC (%6 + W_PRECALC_AHEAD + 1) -add T2, %5 -mov %5, T2 -rol T2, 5 -add %4, T2 -F %1, %2, %3 ;; F returns result in T1 -add %4, T1 -rol %1, 30 - -;; write: %1, %2 -;; rotate: %1<=%4, %2<=%5, %3<=%1, %4<=%2, %5<=%3 -%endmacro - - - -;;---------------------- -section .data -align 128 - -%xdefine K1 0x5a827999 -%xdefine K2 0x6ed9eba1 -%xdefine K3 0x8f1bbcdc -%xdefine K4 0xca62c1d6 - -align 128 -K_XMM_AR: -DD K1, K1, K1, K1 -DD K2, K2, K2, K2 -DD K3, K3, K3, K3 -DD K4, K4, K4, K4 - -align 16 -bswap_shufb_ctl: -DD 00010203h -DD 04050607h -DD 08090a0bh -DD 0c0d0e0fh - -;; dispatch pointer, points to the init routine for the first invocation -sha1_update_intel_dispatched: -DQ sha1_update_intel_init_ - -;;---------------------- -section .text -align 4096 - -SHA1_VECTOR_ASM sha1_update_intel_ssse3_, multiblock - -align 32 -sha1_update_intel_init_: ;; we get here with the first time 
invocation -call sha1_update_intel_dispacth_init_ -INTEL_SHA1_UPDATE_FUNCNAME: ;; we get here after init -jmp qword [sha1_update_intel_dispatched] - -;; CPUID feature flag based dispatch -sha1_update_intel_dispacth_init_: -push rax -push rbx -push rcx -push rdx -push rsi - -lea rsi, [INTEL_SHA1_UPDATE_DEFAULT_DISPATCH] - -mov eax, 1 -cpuid - -test ecx, 0200h ;; SSSE3 support, CPUID.1.ECX[bit 9] -jz _done - -lea rsi, [sha1_update_intel_ssse3_] - -_done: -mov [sha1_update_intel_dispatched], rsi - -pop rsi -pop rdx -pop rcx -pop rbx -pop rax -ret - -;;---------------------- -;; in the case a default SHA-1 update function implementation was not provided -;; and code was invoked on a non-SSSE3 supporting CPU, dispatch handles this -;; failure in a safest way - jumps to the stub function with UD2 instruction below -sha1_intel_non_ssse3_cpu_stub_: -ud2 ;; in the case no default SHA-1 was provided non-SSSE3 CPUs safely fail here -ret - -; END -;---------------------- diff --git a/src/sha1.c b/src/sha1.c index b3062389..9cec68e9 100644 --- a/src/sha1.c +++ b/src/sha1.c @@ -29,211 +29,601 @@ # include "config.h" #endif +#include "wimlib/cpu_features.h" #include "wimlib/endianness.h" #include "wimlib/sha1.h" #include "wimlib/unaligned.h" -/* Dummy SHA-1 message digest of all 0's. This is used in the WIM format to - * mean "SHA-1 not specified". */ -const u8 zero_hash[20]; +/*----------------------------------------------------------------------------* + * Shared helpers * + *----------------------------------------------------------------------------*/ -/* - * Builds a hexadecimal string representation of a SHA-1 message digest. - * - * The output buffer must be at least 41 characters. - */ -void -sprint_hash(const u8 hash[SHA1_HASH_SIZE], tchar strbuf[SHA1_HASH_STRING_LEN]) +static inline u32 +rol32(u32 v, int bits) { - int i; - u8 high, low; - - for (i = 0; i < SHA1_HASH_SIZE; i++) { - high = hash[i] >> 4; - low = hash[i] & 0xF; - strbuf[i * 2 + 0] = (high < 10 ? 
high + '0' : high - 10 + 'a'); - strbuf[i * 2 + 1] = (low < 10 ? low + '0' : low - 10 + 'a'); - } - strbuf[i * 2] = 0; + return (v << bits) | (v >> (32 - bits)); } -/* If we use libcrypto (e.g. OpenSSL) then we get all the SHA-1 functions for - * free. Otherwise we need to implement them ourselves. */ +/* Expands to the round constant for the given round */ +#define SHA1_K(i) \ + (((i) < 20) ? 0x5A827999 : \ + ((i) < 40) ? 0x6ED9EBA1 : \ + ((i) < 60) ? 0x8F1BBCDC : \ + 0xCA62C1D6) -#ifndef WITH_LIBCRYPTO +/* Expands to the computation on b, c, and d for the given round */ +#define SHA1_F(i, b, c, d) \ + (((i) < 20) ? /* Choice */ (b & (c ^ d)) ^ d : \ + ((i) < 40) ? /* Parity */ b ^ c ^ d : \ + ((i) < 60) ? /* Majority */ (c & d) ^ (b & (c ^ d)) : \ + /* Parity */ b ^ c ^ d) -#define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits)))) +/* + * Expands to a memory barrier for the given array, preventing values of the + * array from being cached in registers past the barrier. Use this to prevent + * the compiler from making counter-productive optimizations when there aren't + * enough registers available to hold the full array. + */ +#define FORCE_NOT_CACHED(array) asm volatile("" : "+m" (array)) -#define blk0(i) (tmp[i] = be32_to_cpu(load_be32_unaligned(&(block)[(i) * 4]))) +/* + * Expands to FORCE_NOT_CACHED() if the architecture has 16 or fewer general + * purpose registers, otherwise does nothing. 
+ */ +#if defined(__i386__) || defined(__x86_64__) || defined(__arm__) +# define FORCE_NOT_CACHED_IF_FEW_REGS(array) FORCE_NOT_CACHED(array) +#else +# define FORCE_NOT_CACHED_IF_FEW_REGS(array) (void)(array) +#endif -#define blk(i) (tmp[i & 15] = rol(tmp[(i + 13) & 15] ^ \ - tmp[(i + 8) & 15] ^ \ - tmp[(i + 2) & 15] ^ \ - tmp[(i + 0) & 15], 1)) +/*----------------------------------------------------------------------------* + * Generic implementation * + *----------------------------------------------------------------------------*/ -#define R0(v, w, x, y, z, i) \ - z += ((w & (x ^ y)) ^ y) + blk0(i) + 0x5A827999 + rol(v, 5); \ - w = rol(w, 30); +/* + * This is SHA-1 in portable C code. It computes the message schedule + * just-in-time, in a rolling window of length 16. + */ -#define R1(v, w, x, y, z, i) \ - z += ((w & (x ^ y)) ^ y) + blk(i) + 0x5A827999 + rol(v, 5); \ - w = rol(w, 30); +#define SHA1_GENERIC_ROUND(i, a, b, c, d, e) \ + FORCE_NOT_CACHED_IF_FEW_REGS(w); \ + if ((i) < 16) \ + w[i] = get_unaligned_be32(data + ((i) * 4)); \ + else \ + w[(i) % 16] = rol32(w[((i) - 16) % 16] ^ \ + w[((i) - 14) % 16] ^ \ + w[((i) - 8) % 16] ^ \ + w[((i) - 3) % 16], 1); \ + e += w[(i) % 16] + rol32(a, 5) + SHA1_F((i), b, c, d) + SHA1_K(i); \ + b = rol32(b, 30); + /* implicit: the new (a, b, c, d, e) is the old (e, a, b, c, d) */ + +#define SHA1_GENERIC_5ROUNDS(i) \ + SHA1_GENERIC_ROUND((i) + 0, a, b, c, d, e); \ + SHA1_GENERIC_ROUND((i) + 1, e, a, b, c, d); \ + SHA1_GENERIC_ROUND((i) + 2, d, e, a, b, c); \ + SHA1_GENERIC_ROUND((i) + 3, c, d, e, a, b); \ + SHA1_GENERIC_ROUND((i) + 4, b, c, d, e, a); + +#define SHA1_GENERIC_20ROUNDS(i) \ + SHA1_GENERIC_5ROUNDS((i) + 0); \ + SHA1_GENERIC_5ROUNDS((i) + 5); \ + SHA1_GENERIC_5ROUNDS((i) + 10); \ + SHA1_GENERIC_5ROUNDS((i) + 15); -#define R2(v, w, x, y, z, i) \ - z += (w ^ x ^ y) + blk(i) + 0x6ED9EBA1 + rol(v, 5); \ - w = rol(w, 30); +static void +sha1_blocks_generic(u32 h[5], const void *data, size_t num_blocks) +{ + do { + u32 a 
= h[0]; + u32 b = h[1]; + u32 c = h[2]; + u32 d = h[3]; + u32 e = h[4]; + u32 w[16]; + + SHA1_GENERIC_20ROUNDS(0); + SHA1_GENERIC_20ROUNDS(20); + SHA1_GENERIC_20ROUNDS(40); + SHA1_GENERIC_20ROUNDS(60); + + h[0] += a; + h[1] += b; + h[2] += c; + h[3] += d; + h[4] += e; + data += SHA1_BLOCK_SIZE; + } while (--num_blocks); +} -#define R3(v, w, x, y, z, i) \ - z += (((w | x) & y) | (w & x)) + blk(i) + 0x8F1BBCDC + rol(v, 5); \ - w = rol(w, 30); +/*----------------------------------------------------------------------------* + * x86 SSSE3 (and AVX+BMI2) implementation * + *----------------------------------------------------------------------------*/ -#define R4(v, w, x, y, z, i) \ - z += (w ^ x ^ y) + blk(i) + 0xCA62C1D6 + rol(v, 5); \ - w = rol(w, 30); +/* + * This is SHA-1 using the x86 SSSE3 instructions. A copy of it is also + * compiled with AVX and BMI2 code generation enabled for improved performance. + * + * Unfortunately this isn't actually much faster than the generic + * implementation, since only the message schedule can be vectorized, not the + * SHA itself. The vectorized computation of the message schedule is + * interleaved with the scalar computation of the SHA itself. + * + * Specifically, 16 rounds ahead of time, the words of the message schedule are + * calculated, the round constants are added to them, and they are stored in a + * temporary array that the scalar code reads from later. This is done 4 words + * at a time, but split into 4 steps, so that one step is executed during each + * round. Rounds 16-31 use the usual formula 'w[i] = rol32(w[i-16] ^ w[i-14] ^ + * w[i-8] ^ w[i-3], 1)', while rounds 32-79 use the equivalent formula 'w[i] = + * rol32(w[i-32] ^ w[i-28] ^ w[i-16] ^ w[i-6], 2)' for improved vectorization. + * + * During rounds 80-95, the first 16 message schedule words for the next block + * are prepared. 
+ */ +#if defined(__i386__) || defined(__x86_64__) +#include <immintrin.h> + +#define SHA1_SSSE3_PRECALC(i, w0, w1, w2, w3, w4, w5, w6, w7) \ + if ((i) % 20 == 0) \ + k = _mm_set1_epi32(SHA1_K((i) % 80)); \ + if ((i) < 32) { \ + /* + * Vectorized computation of w[i] = rol32(w[i-16] ^ w[i-14] ^ + * w[i-8] ^ w[i-3], 1) for i...i+3, split into 4 steps. + * w[i-16..i+3] are in (w0, w1, w2, w3, w4). + */ \ + if ((i) % 4 == 0) { \ + w4 = _mm_alignr_epi8(w1, w0, 8) ^ w2; \ + t0 = _mm_srli_si128(w3, 4); \ + } else if ((i) % 4 == 1) { \ + t0 ^= w4 ^ w0; \ + t1 = _mm_slli_si128(t0, 12); \ + } else if ((i) % 4 == 2) { \ + t2 = _mm_slli_epi32(t1, 2); \ + w4 = _mm_slli_epi32(t0, 1); \ + t0 = _mm_srli_epi32(t0, 31); \ + t2 ^= _mm_srli_epi32(t1, 30); \ + } else { \ + w4 ^= t0 ^ t2; \ + t0 = _mm_add_epi32(w4, k); \ + _mm_store_si128((__m128i *)&tmp[((i) - 3) % 16], t0); \ + } \ + } else if ((i) < 80) { \ + /* + * Vectorized computation of w[i] = rol32(w[i-32] ^ w[i-28] ^ + * w[i-16] ^ w[i-6], 2) for i...i+3, split into 4 steps. + * w[i-32..i+3] are in (w4, w5, w6, w7, w0, w1, w2, w3, w4); + * note the reuse of w4. + */ \ + if ((i) % 4 == 0) \ + w4 ^= _mm_alignr_epi8(w3, w2, 8); \ + else if ((i) % 4 == 1) \ + w4 ^= w5 ^ w0; \ + else if ((i) % 4 == 2) \ + w4 = _mm_slli_epi32(w4, 2) ^ \ + _mm_srli_epi32(w4, 30); \ + else \ + _mm_store_si128((__m128i *)&tmp[((i) - 3) % 16],\ + _mm_add_epi32(w4, k)); \ + } else if ((i) < 96) { \ + /* Precomputation of w[0..15] for next block */ \ + if ((i) == 80 && --num_blocks != 0) \ + data += SHA1_BLOCK_SIZE; \ + if ((i) % 4 == 0) \ + w0 = _mm_loadu_si128(data + (((i) - 80) * 4)); \ + else if ((i) % 4 == 1) \ + w0 = _mm_shuffle_epi8(w0, bswap32_mask); \ + else if ((i) % 4 == 2) \ + t0 = _mm_add_epi32(w0, k); \ + else \ + _mm_store_si128((__m128i *)&tmp[(i) - 83], t0); \ + } -/* Hash a single 512-bit block. This is the core of the algorithm. 
*/ -static void -sha1_transform_default(u32 state[5], const u8 block[64]) +#define SHA1_SSSE3_2ROUNDS(i, a, b, c, d, e, w0, w1, w2, w3, w4, w5, w6, w7) \ + FORCE_NOT_CACHED(tmp); \ + e += tmp[(i) % 16] + rol32(a, 5) + SHA1_F((i), b, c, d); \ + b = rol32(b, 30); \ + SHA1_SSSE3_PRECALC((i) + 16, w0, w1, w2, w3, w4, w5, w6, w7); \ + FORCE_NOT_CACHED(tmp); \ + d += tmp[((i) + 1) % 16] + rol32(e, 5) + SHA1_F((i) + 1, a, b, c); \ + SHA1_SSSE3_PRECALC((i) + 17, w0, w1, w2, w3, w4, w5, w6, w7); \ + a = rol32(a, 30); + /* implicit: the new (a, b, c, d, e) is the old (d, e, a, b, c) */ + +#define SHA1_SSSE3_4ROUNDS(i, a, b, c, d, e, w0, w1, w2, w3, w4, w5, w6, w7) \ + SHA1_SSSE3_2ROUNDS((i) + 0, a, b, c, d, e, w0, w1, w2, w3, w4, w5, w6, w7); \ + SHA1_SSSE3_2ROUNDS((i) + 2, d, e, a, b, c, w0, w1, w2, w3, w4, w5, w6, w7); \ + /* + * implicit: the new (w0-w7) is the old (w1-w7,w0), + * and the new (a, b, c, d, e) is the old (b, c, d, e, a) + */ + +#define SHA1_SSSE3_20ROUNDS(i, w0, w1, w2, w3, w4, w5, w6, w7) \ + SHA1_SSSE3_4ROUNDS((i) + 0, a, b, c, d, e, w0, w1, w2, w3, w4, w5, w6, w7); \ + SHA1_SSSE3_4ROUNDS((i) + 4, b, c, d, e, a, w1, w2, w3, w4, w5, w6, w7, w0); \ + SHA1_SSSE3_4ROUNDS((i) + 8, c, d, e, a, b, w2, w3, w4, w5, w6, w7, w0, w1); \ + SHA1_SSSE3_4ROUNDS((i) + 12, d, e, a, b, c, w3, w4, w5, w6, w7, w0, w1, w2); \ + SHA1_SSSE3_4ROUNDS((i) + 16, e, a, b, c, d, w4, w5, w6, w7, w0, w1, w2, w3); + /* implicit: the new (w0-w7) is the old (w5-w7,w0-w4) */ + +#define SHA1_SSSE3_BODY \ + const __m128i bswap32_mask = \ + _mm_setr_epi8( 3, 2, 1, 0, 7, 6, 5, 4, \ + 11, 10, 9, 8, 15, 14, 13, 12); \ + __m128i w0, w1, w2, w3, w4, w5, w6, w7; \ + __m128i k = _mm_set1_epi32(SHA1_K(0)); \ + u32 tmp[16] __attribute__((aligned(16))); \ + \ + w0 = _mm_shuffle_epi8(_mm_loadu_si128(data + 0), bswap32_mask); \ + w1 = _mm_shuffle_epi8(_mm_loadu_si128(data + 16), bswap32_mask); \ + w2 = _mm_shuffle_epi8(_mm_loadu_si128(data + 32), bswap32_mask); \ + w3 = 
_mm_shuffle_epi8(_mm_loadu_si128(data + 48), bswap32_mask); \ + _mm_store_si128((__m128i *)&tmp[0], _mm_add_epi32(w0, k)); \ + _mm_store_si128((__m128i *)&tmp[4], _mm_add_epi32(w1, k)); \ + _mm_store_si128((__m128i *)&tmp[8], _mm_add_epi32(w2, k)); \ + _mm_store_si128((__m128i *)&tmp[12], _mm_add_epi32(w3, k)); \ + \ + do { \ + u32 a = h[0]; \ + u32 b = h[1]; \ + u32 c = h[2]; \ + u32 d = h[3]; \ + u32 e = h[4]; \ + __m128i t0, t1, t2; \ + \ + SHA1_SSSE3_20ROUNDS(0, w0, w1, w2, w3, w4, w5, w6, w7); \ + SHA1_SSSE3_20ROUNDS(20, w5, w6, w7, w0, w1, w2, w3, w4); \ + SHA1_SSSE3_20ROUNDS(40, w2, w3, w4, w5, w6, w7, w0, w1); \ + SHA1_SSSE3_20ROUNDS(60, w7, w0, w1, w2, w3, w4, w5, w6); \ + \ + h[0] += a; \ + h[1] += b; \ + h[2] += c; \ + h[3] += d; \ + h[4] += e; \ + \ + /* 'data' and 'num_blocks' were updated at start of round 64. */ \ + } while (num_blocks); + +#define HAVE_SHA1_BLOCKS_X86_SSSE3 +static void __attribute__((target("ssse3"))) +sha1_blocks_x86_ssse3(u32 h[5], const void *data, size_t num_blocks) { - u32 a, b, c, d, e; - u32 tmp[16]; - - /* Copy ctx->state[] to working vars */ - a = state[0]; - b = state[1]; - c = state[2]; - d = state[3]; - e = state[4]; - - /* 4 rounds of 20 operations each. Loop unrolled. 
*/ - R0(a,b,c,d,e, 0); R0(e,a,b,c,d, 1); R0(d,e,a,b,c, 2); R0(c,d,e,a,b, 3); - R0(b,c,d,e,a, 4); R0(a,b,c,d,e, 5); R0(e,a,b,c,d, 6); R0(d,e,a,b,c, 7); - R0(c,d,e,a,b, 8); R0(b,c,d,e,a, 9); R0(a,b,c,d,e,10); R0(e,a,b,c,d,11); - R0(d,e,a,b,c,12); R0(c,d,e,a,b,13); R0(b,c,d,e,a,14); R0(a,b,c,d,e,15); - R1(e,a,b,c,d,16); R1(d,e,a,b,c,17); R1(c,d,e,a,b,18); R1(b,c,d,e,a,19); - R2(a,b,c,d,e,20); R2(e,a,b,c,d,21); R2(d,e,a,b,c,22); R2(c,d,e,a,b,23); - R2(b,c,d,e,a,24); R2(a,b,c,d,e,25); R2(e,a,b,c,d,26); R2(d,e,a,b,c,27); - R2(c,d,e,a,b,28); R2(b,c,d,e,a,29); R2(a,b,c,d,e,30); R2(e,a,b,c,d,31); - R2(d,e,a,b,c,32); R2(c,d,e,a,b,33); R2(b,c,d,e,a,34); R2(a,b,c,d,e,35); - R2(e,a,b,c,d,36); R2(d,e,a,b,c,37); R2(c,d,e,a,b,38); R2(b,c,d,e,a,39); - R3(a,b,c,d,e,40); R3(e,a,b,c,d,41); R3(d,e,a,b,c,42); R3(c,d,e,a,b,43); - R3(b,c,d,e,a,44); R3(a,b,c,d,e,45); R3(e,a,b,c,d,46); R3(d,e,a,b,c,47); - R3(c,d,e,a,b,48); R3(b,c,d,e,a,49); R3(a,b,c,d,e,50); R3(e,a,b,c,d,51); - R3(d,e,a,b,c,52); R3(c,d,e,a,b,53); R3(b,c,d,e,a,54); R3(a,b,c,d,e,55); - R3(e,a,b,c,d,56); R3(d,e,a,b,c,57); R3(c,d,e,a,b,58); R3(b,c,d,e,a,59); - R4(a,b,c,d,e,60); R4(e,a,b,c,d,61); R4(d,e,a,b,c,62); R4(c,d,e,a,b,63); - R4(b,c,d,e,a,64); R4(a,b,c,d,e,65); R4(e,a,b,c,d,66); R4(d,e,a,b,c,67); - R4(c,d,e,a,b,68); R4(b,c,d,e,a,69); R4(a,b,c,d,e,70); R4(e,a,b,c,d,71); - R4(d,e,a,b,c,72); R4(c,d,e,a,b,73); R4(b,c,d,e,a,74); R4(a,b,c,d,e,75); - R4(e,a,b,c,d,76); R4(d,e,a,b,c,77); R4(c,d,e,a,b,78); R4(b,c,d,e,a,79); - - /* Add the working vars back into context.state[] */ - state[0] += a; - state[1] += b; - state[2] += c; - state[3] += d; - state[4] += e; + SHA1_SSSE3_BODY; } -#ifdef ENABLE_SSSE3_SHA1 -extern void -sha1_transform_blocks_ssse3(u32 state[5], const void *data, size_t num_blocks); -extern void -sha1_transform_blocks_default(u32 state[5], const void *data, size_t num_blocks); -# define sha1_transform_blocks sha1_transform_blocks_ssse3 -#else -# define sha1_transform_blocks sha1_transform_blocks_default +#define 
HAVE_SHA1_BLOCKS_X86_AVX_BMI2 +static void __attribute__((target("avx,bmi2"))) +sha1_blocks_x86_avx_bmi2(u32 h[5], const void *data, size_t num_blocks) +{ + SHA1_SSSE3_BODY; +} +#endif /* x86 SSSE3 (and AVX+BMI2) implementation */ + +/*----------------------------------------------------------------------------* + * x86 SHA Extensions implementation * + *----------------------------------------------------------------------------*/ + +/* + * This is SHA-1 using the x86 SHA extensions. + * + * The SHA1RNDS4 instruction does most of the work. It takes in a 128-bit + * vector containing 'a', 'b', 'c', and 'd' (high-order to low-order), a 128-bit + * vector containing the next 4 words of the message schedule with 'e' added to + * the high-order word, and an immediate that identifies the current 20-round + * section. It does 4 rounds and updates 'a', 'b', 'c', and 'd' accordingly. + * + * Each SHA1RNDS4 is paired with SHA1NEXTE. It takes in the abcd vector, + * calculates the value of 'e' after 4 rounds, and adds it to the high-order + * word of a vector that contains the next 4 words of the message schedule. + * + * Each 4 words of the message schedule for rounds 16-79 is calculated as + * rol32(w[i-16] ^ w[i-14] ^ w[i-8] ^ w[i-3], 1) in three steps using the + * SHA1MSG1, PXOR, and SHA1MSG2 instructions. This happens in a rolling window, + * so during the j'th set of 4 rounds we do the SHA1MSG2 step for j+1'th set of + * message schedule words, PXOR for j+2'th set, and SHA1MSG1 for the j+3'th set. 
+ */ +#if defined(__i386__) || defined(__x86_64__) +#include <immintrin.h> + +#define SHA1_NI_4ROUNDS(i, w0, w1, w2, w3, we0, we1) \ + if ((i) < 16) \ + w0 = _mm_shuffle_epi8( \ + _mm_loadu_si128(data + ((i) * 4)), bswap_mask); \ + if ((i) == 0) \ + we0 = _mm_add_epi32(h_e, w0); \ + else \ + we0 = _mm_sha1nexte_epu32(/* old abcd */ we0, w0); \ + we1 = abcd; \ + if ((i) >= 12 && (i) < 76) \ + w1 = _mm_sha1msg2_epu32(w1, w0); \ + abcd = _mm_sha1rnds4_epu32(abcd, we0, (i) / 20); \ + if ((i) >= 8 && (i) < 72) \ + w2 ^= w0; \ + if ((i) >= 4 && (i) < 68) \ + w3 = _mm_sha1msg1_epu32(w3, w0); \ + /* + * implicit: the new (w0, w1, w2, w3) is the old (w1, w2, w3, w0), + * and the new (we0, we1) is the old (we1, we0) + */ + +#define SHA1_NI_16ROUNDS(i) \ + SHA1_NI_4ROUNDS((i) + 0, w0, w1, w2, w3, we0, we1); \ + SHA1_NI_4ROUNDS((i) + 4, w1, w2, w3, w0, we1, we0); \ + SHA1_NI_4ROUNDS((i) + 8, w2, w3, w0, w1, we0, we1); \ + SHA1_NI_4ROUNDS((i) + 12, w3, w0, w1, w2, we1, we0); + +#define HAVE_SHA1_BLOCKS_X86_SHA +static void __attribute__((target("sha,sse4.1"))) +sha1_blocks_x86_sha(u32 h[5], const void *data, size_t num_blocks) +{ + const __m128i bswap_mask = + _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, + 7, 6, 5, 4, 3, 2, 1, 0); + __m128i h_abcd = _mm_shuffle_epi32( + _mm_loadu_si128((__m128i *)h), 0x1B); + __m128i h_e = _mm_setr_epi32(0, 0, 0, h[4]); + + do { + __m128i abcd = h_abcd; + __m128i w0, w1, w2, w3, we0, we1; + + SHA1_NI_16ROUNDS(0); + SHA1_NI_16ROUNDS(16); + SHA1_NI_16ROUNDS(32); + SHA1_NI_16ROUNDS(48); + SHA1_NI_16ROUNDS(64); + + h_abcd = _mm_add_epi32(h_abcd, abcd); + h_e = _mm_sha1nexte_epu32(we0, h_e); + data += SHA1_BLOCK_SIZE; + } while (--num_blocks); + + _mm_storeu_si128((__m128i *)h, _mm_shuffle_epi32(h_abcd, 0x1B)); + h[4] = _mm_extract_epi32(h_e, 3); +} +#endif /* x86 SHA Extensions implementation */ + +/*----------------------------------------------------------------------------* + * ARMv8 Crypto Extensions implementation * + 
 *----------------------------------------------------------------------------*/ + +/* + * This is SHA-1 using the ARMv8 Crypto Extensions. + * + * This does 4 rounds at a time, and it works very similarly to the x86 SHA + * Extensions implementation. The differences are fairly minor: + * + * - x86 has SHA1RNDS4 that takes an immediate that identifies the set of 20 + * rounds, and it handles adding the round constants. ARM has SHA1C for + * rounds 0-19, SHA1P for rounds 20-39 and 60-79, and SHA1M for rounds 40-59. + * These don't add the round constants, so that must be done separately. + * + * - ARM needs only two instructions, instead of x86's three, to prepare each + * set of 4 message schedule words: SHA1SU0 which does w[i-16] ^ w[i-14] ^ + * w[i-8], and SHA1SU1 which XOR's in w[i-3] and rotates left by 1. + */ +#if defined(__aarch64__) && \ + (defined(__clang__) || (defined(__GNUC__) && __GNUC__ >= 5)) + +/* + * clang's arm_neon.h has a bug where it only defines the SHA-1 intrinsics when + * SHA-2 is enabled in the main target. This prevents them from being used in + * target attribute functions. Work around this by defining __ARM_FEATURE_SHA2. + * + * And yes, the feature it wants is indeed *SHA-2*, not SHA-1. + */ +#if defined(__clang__) && !defined(__ARM_FEATURE_SHA2) +# define __ARM_FEATURE_SHA2 1 +# define USED_SHA2_FEATURE_WORKAROUND +#endif +#include <arm_neon.h> +#ifdef USED_SHA2_FEATURE_WORKAROUND +# undef __ARM_FEATURE_SHA2 #endif -#ifndef ENABLE_SSSE3_SHA1 -static +/* Expands to a vector containing 4 copies of the given round's constant */ +#define SHA1_CE_K(i) \ + ((i) < 20 ? k0 : \ + (i) < 40 ? k1 : \ + (i) < 60 ? k2 : \ + k3) + +/* Expands to the appropriate instruction for the given round */ +#define SHA1_CE_OP(i, abcd, e, w) \ + ((i) < 20 ? vsha1cq_u32((abcd), (e), (w)) : \ + (i) < 40 ? vsha1pq_u32((abcd), (e), (w)) : \ + (i) < 60 ? 
vsha1mq_u32((abcd), (e), (w)) : \ + vsha1pq_u32((abcd), (e), (w))) + +#define SHA1_CE_4ROUNDS(i, w0, w1, w2, w3, e0, e1) \ + tmp = w0 + SHA1_CE_K(i); \ + e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0)); \ + abcd = SHA1_CE_OP((i), abcd, e0, tmp); \ + if ((i) >= 12 && (i) < 76) \ + w1 = vsha1su1q_u32(w1, w0); \ + if ((i) >= 8 && (i) < 72) \ + w2 = vsha1su0q_u32(w2, w3, w0); + /* + * implicit: the new (w0, w1, w2, w3) is the old (w1, w2, w3, w0), + * and the new (e0, e1) is the old (e1, e0) + */ + +#define SHA1_CE_16ROUNDS(i) \ + SHA1_CE_4ROUNDS((i) + 0, w0, w1, w2, w3, e0, e1); \ + SHA1_CE_4ROUNDS((i) + 4, w1, w2, w3, w0, e1, e0); \ + SHA1_CE_4ROUNDS((i) + 8, w2, w3, w0, w1, e0, e1); \ + SHA1_CE_4ROUNDS((i) + 12, w3, w0, w1, w2, e1, e0); + +#define HAVE_SHA1_BLOCKS_ARM_CE +static void +#ifdef __clang__ + __attribute__((target("crypto"))) +#else + __attribute__((target("+crypto"))) #endif -void -sha1_transform_blocks_default(u32 state[5], const void *data, size_t num_blocks) +sha1_blocks_arm_ce(u32 h[5], const void *data, size_t num_blocks) { + uint32x4_t h_abcd = vld1q_u32(h); + uint32x4_t k0 = vdupq_n_u32(SHA1_K(0)); + uint32x4_t k1 = vdupq_n_u32(SHA1_K(20)); + uint32x4_t k2 = vdupq_n_u32(SHA1_K(40)); + uint32x4_t k3 = vdupq_n_u32(SHA1_K(60)); + do { - sha1_transform_default(state, data); - data += 64; + uint32x4_t abcd = h_abcd; + u32 e0 = h[4], e1; + uint32x4_t tmp, w0, w1, w2, w3; + + w0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(data + 0))); + w1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(data + 16))); + w2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(data + 32))); + w3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(data + 48))); + + SHA1_CE_16ROUNDS(0); + SHA1_CE_16ROUNDS(16); + SHA1_CE_16ROUNDS(32); + SHA1_CE_16ROUNDS(48); + SHA1_CE_16ROUNDS(64); + + h_abcd += abcd; + h[4] += e0; + data += SHA1_BLOCK_SIZE; } while (--num_blocks); + + vst1q_u32(h, h_abcd); } +#endif /* ARMv8 Crypto Extensions implementation */ -/* Initializes the specified SHA-1 context. 
+/*----------------------------------------------------------------------------* + * Everything else * + *----------------------------------------------------------------------------*/ + +static void +sha1_blocks(u32 h[5], const void *data, size_t num_blocks) +{ +#ifdef HAVE_SHA1_BLOCKS_X86_SHA + if ((cpu_features & (X86_CPU_FEATURE_SHA | X86_CPU_FEATURE_SSE4_1)) == + (X86_CPU_FEATURE_SHA | X86_CPU_FEATURE_SSE4_1)) + return sha1_blocks_x86_sha(h, data, num_blocks); +#endif +#ifdef HAVE_SHA1_BLOCKS_X86_AVX_BMI2 + if ((cpu_features & (X86_CPU_FEATURE_AVX | X86_CPU_FEATURE_BMI2)) == + (X86_CPU_FEATURE_AVX | X86_CPU_FEATURE_BMI2)) + return sha1_blocks_x86_avx_bmi2(h, data, num_blocks); +#endif +#ifdef HAVE_SHA1_BLOCKS_X86_SSSE3 + if (cpu_features & X86_CPU_FEATURE_SSSE3) + return sha1_blocks_x86_ssse3(h, data, num_blocks); +#endif +#ifdef HAVE_SHA1_BLOCKS_ARM_CE + if (cpu_features & ARM_CPU_FEATURE_SHA1) + return sha1_blocks_arm_ce(h, data, num_blocks); +#endif + return sha1_blocks_generic(h, data, num_blocks); +} + +/* + * Initialize the given SHA-1 context. * * After sha1_init(), call sha1_update() zero or more times to provide the data - * to be hashed. Then call sha1_final() to get the final hash. */ + * to be hashed. Then call sha1_final() to get the resulting message digest. + */ void -sha1_init(SHA_CTX *ctx) +sha1_init(struct sha1_ctx *ctx) { ctx->bytecount = 0; - ctx->state[0] = 0x67452301; - ctx->state[1] = 0xEFCDAB89; - ctx->state[2] = 0x98BADCFE; - ctx->state[3] = 0x10325476; - ctx->state[4] = 0xC3D2E1F0; + ctx->h[0] = 0x67452301; + ctx->h[1] = 0xEFCDAB89; + ctx->h[2] = 0x98BADCFE; + ctx->h[3] = 0x10325476; + ctx->h[4] = 0xC3D2E1F0; } -/* Updates the SHA-1 context with @len bytes of data. */ +/* Update the SHA-1 context with @len bytes of data. 
*/ void -sha1_update(SHA_CTX *ctx, const void *data, size_t len) +sha1_update(struct sha1_ctx *ctx, const void *data, size_t len) { - unsigned buffered = ctx->bytecount & 63; + unsigned buffered = ctx->bytecount % SHA1_BLOCK_SIZE; + size_t blocks; ctx->bytecount += len; if (buffered) { - /* Previous block is unfinished. */ - if (len < 64 - buffered) { + unsigned remaining = SHA1_BLOCK_SIZE - buffered; + + if (len < remaining) { memcpy(&ctx->buffer[buffered], data, len); - /* Previous block still unfinished. */ return; - } else { - memcpy(&ctx->buffer[buffered], data, 64 - buffered); - /* Finished the previous block. */ - sha1_transform_blocks(ctx->state, ctx->buffer, 1); - data += 64 - buffered; - len -= 64 - buffered; } + memcpy(&ctx->buffer[buffered], data, remaining); + sha1_blocks(ctx->h, ctx->buffer, 1); + data += remaining; + len -= remaining; } - /* Process blocks directly from the input data. */ - if (len / 64) { - sha1_transform_blocks(ctx->state, data, len / 64); - data += len & ~63; - len &= 63; + blocks = len / SHA1_BLOCK_SIZE; + if (blocks) { + sha1_blocks(ctx->h, data, blocks); + data += blocks * SHA1_BLOCK_SIZE; + len -= blocks * SHA1_BLOCK_SIZE; } - /* Copy any remaining bytes to the buffer. */ if (len) memcpy(ctx->buffer, data, len); } -/* Pad the message and generate the final SHA-1 message digest. */ +/* Finalize the SHA-1 operation and return the resulting message digest. */ void -sha1_final(u8 md[20], SHA_CTX *ctx) +sha1_final(struct sha1_ctx *ctx, u8 hash[SHA1_HASH_SIZE]) { - /* Logically, we must append 1 bit, then a variable number of 0 bits, - * then the message length in bits as a big-endian integer, so that the - * final length is a multiple of the block size. 
*/ - static const u8 padding[64] = {0x80, }; - be64 finalcount = cpu_to_be64(ctx->bytecount << 3); - - sha1_update(ctx, padding, 64 - ((ctx->bytecount + 8) & 63)); - sha1_update(ctx, &finalcount, 8); - - for (int i = 0; i < 5; i++) - store_be32_unaligned(cpu_to_be32(ctx->state[i]), &md[i * 4]); + unsigned buffered = ctx->bytecount % SHA1_BLOCK_SIZE; + const be64 bitcount = cpu_to_be64(ctx->bytecount * 8); + + ctx->buffer[buffered++] = 0x80; + if (buffered > SHA1_BLOCK_SIZE - 8) { + memset(&ctx->buffer[buffered], 0, SHA1_BLOCK_SIZE - buffered); + sha1_blocks(ctx->h, ctx->buffer, 1); + buffered = 0; + } + memset(&ctx->buffer[buffered], 0, SHA1_BLOCK_SIZE - 8 - buffered); + memcpy(&ctx->buffer[SHA1_BLOCK_SIZE - 8], &bitcount, 8); + sha1_blocks(ctx->h, ctx->buffer, 1); + + put_unaligned_be32(ctx->h[0], &hash[0]); + put_unaligned_be32(ctx->h[1], &hash[4]); + put_unaligned_be32(ctx->h[2], &hash[8]); + put_unaligned_be32(ctx->h[3], &hash[12]); + put_unaligned_be32(ctx->h[4], &hash[16]); } /* Calculate the SHA-1 message digest of the given data. */ void sha1(const void *data, size_t len, u8 hash[SHA1_HASH_SIZE]) { - SHA_CTX ctx; + struct sha1_ctx ctx; sha1_init(&ctx); sha1_update(&ctx, data, len); - sha1_final(hash, &ctx); + sha1_final(&ctx, hash); } -#endif /* !WITH_LIBCRYPTO */ +/* "Null" SHA-1 message digest containing all 0's */ +const u8 zero_hash[SHA1_HASH_SIZE]; + +/* Build a hexadecimal string representation of a SHA-1 message digest. */ +void +sprint_hash(const u8 hash[SHA1_HASH_SIZE], tchar strbuf[SHA1_HASH_STRING_LEN]) +{ + int i; + u8 high, low; + + for (i = 0; i < SHA1_HASH_SIZE; i++) { + high = hash[i] >> 4; + low = hash[i] & 0xF; + strbuf[i * 2 + 0] = (high < 10 ? high + '0' : high - 10 + 'a'); + strbuf[i * 2 + 1] = (low < 10 ? 
low + '0' : low - 10 + 'a'); + } + strbuf[i * 2] = 0; +} diff --git a/tools/make-windows-release b/tools/make-windows-release index 5f5da584..d95c6986 100755 --- a/tools/make-windows-release +++ b/tools/make-windows-release @@ -47,10 +47,6 @@ if ! [ -e config.log ] || ! grep -q "configure: exit 0" config.log || \ [ $# -gt 0 ] then - extra_args= - if [ $ARCH = x86_64 ]; then - extra_args="--enable-ssse3-sha1" - fi # Note: putting -static-libgcc in CC is a workaround for libtool # stripping it: # http://www.gnu.org/software/libtool/manual/libtool.html#Stripped-link-flags @@ -63,8 +59,7 @@ then LDFLAGS="-L$SYSROOT/lib" \ PKG_CONFIG=pkg-config \ PKG_CONFIG_LIBDIR="$SYSROOT/lib/pkgconfig" \ - --without-libcrypto \ - $extra_args "$@" + "$@" $MAKE clean fi $MAKE -- 2.43.0