3 ; This code implements two interfaces of SHA-1 update function: 1) working on a single
4 ; 64-byte block and 2) working on a buffer of multiple 64-byte blocks. The multiple-block
5 ; version of the code is software pipelined and faster overall; it is the default. Assemble
6 ; with -DINTEL_SHA1_SINGLEBLOCK to select single 64-byte block function interface.
8 ; C++ prototypes of implemented functions are below:
10 ; #ifndef INTEL_SHA1_SINGLEBLOCK
11 ; // Updates 20-byte SHA-1 record in 'hash' for 'num_blocks' consecutive 64-byte blocks
12 ; extern "C" void sha1_update_intel(int *hash, const char* input, size_t num_blocks );
14 ; // Updates 20-byte SHA-1 record in 'hash' for one 64-byte block pointed to by 'input'
15 ; extern "C" void sha1_update_intel(int *hash, const char* input);
18 ; Function name 'sha1_update_intel' can be changed in the source or via macro:
19 ; -DINTEL_SHA1_UPDATE_FUNCNAME=my_sha1_update_func_name
21 ; It implements both UNIX(default) and Windows ABIs, use -DWIN_ABI on Windows
23 ; Code checks CPU for SSSE3 support via CPUID feature flag (CPUID.1.ECX.SSSE3[bit 9]==1),
24 ; and performs dispatch. Since in most cases the functionality on non-SSSE3 supporting CPUs
25 ; is also required, the default (e.g. one being replaced) function can be provided for
26 ; dispatch on such CPUs; the name of the old function can be changed in the source or via macro:
27 ; -DINTEL_SHA1_UPDATE_DEFAULT_DISPATCH=default_sha1_update_function_name
29 ; Authors: Maxim Locktyukhin and Ronen Zohar at Intel.com
32 %ifndef INTEL_SHA1_UPDATE_DEFAULT_DISPATCH
33 ;; can be replaced with a default SHA-1 update function name
34 %define INTEL_SHA1_UPDATE_DEFAULT_DISPATCH sha1_intel_non_ssse3_cpu_stub_
36 extern INTEL_SHA1_UPDATE_DEFAULT_DISPATCH
39 ;; provide alternative SHA-1 update function's name here
40 %ifndef INTEL_SHA1_UPDATE_FUNCNAME
41 %define INTEL_SHA1_UPDATE_FUNCNAME sha1_update_intel
44 global INTEL_SHA1_UPDATE_FUNCNAME
47 %ifndef INTEL_SHA1_SINGLEBLOCK
84 %xdefine BUFFER_PTR r10
85 %xdefine BUFFER_END r11
99 %xdefine XMM_SHUFB_BSWAP xmm10
101 ;; we keep a window of 16 pre-calculated w[i]+K values (64 bytes) in a circular buffer
102 %xdefine WK(t) (rsp + (t & 15)*4)
104 ;------------------------------------------------------------------------------
106 ; macro implements SHA-1 function's body for single or several 64-byte blocks
107 ; first param: function's name
108 ; second param: =0 - function implements single 64-byte block hash
109 ; =1 - function implements multiple 64-byte blocks hash
110 ; the function's 3rd argument is the number (greater than 0) of 64-byte blocks to hash
112 %macro SHA1_VECTOR_ASM 2
122 %xdefine stack_size (16*4 + 16*5 + 8)
124 %xdefine stack_size (16*4 + 8)
130 %xdefine xmm_save_base (rsp + 16*4)
132 xmm_mov [xmm_save_base + 0*16], xmm6
133 xmm_mov [xmm_save_base + 1*16], xmm7
134 xmm_mov [xmm_save_base + 2*16], xmm8
135 xmm_mov [xmm_save_base + 3*16], xmm9
136 xmm_mov [xmm_save_base + 4*16], xmm10
143 shl cnt, 6 ;; mul by 64
148 lea K_BASE, [K_XMM_AR]
149 xmm_mov XMM_SHUFB_BSWAP, [bswap_shufb_ctl]
151 SHA1_PIPELINED_MAIN_BODY %2
154 xmm_mov xmm6, [xmm_save_base + 0*16]
155 xmm_mov xmm7, [xmm_save_base + 1*16]
156 xmm_mov xmm8, [xmm_save_base + 2*16]
157 xmm_mov xmm9, [xmm_save_base + 3*16]
158 xmm_mov xmm10,[xmm_save_base + 4*16]
174 ;--------------------------------------------
175 ; macro implements 80 rounds of SHA-1, for one 64-byte block or multiple blocks with s/w pipelining
176 ; macro param: =0 - process single 64-byte block
177 ; =1 - multiple blocks
179 %macro SHA1_PIPELINED_MAIN_BODY 1
198 %if (%1 == 1) ;; code loops through more than one block
200 cmp BUFFER_PTR, K_BASE ;; we use K_BASE value as a signal of a last block,
201 jne %%_begin ;; it is set below by: cmovae BUFFER_PTR, K_BASE
249 %if (%1 == 1) ;; if code loops through more than one block
250 add BUFFER_PTR, 64 ;; move to next 64-byte block
251 cmp BUFFER_PTR, BUFFER_END ;; check if current block is the last one
252 cmovae BUFFER_PTR, K_BASE ;; smart way to signal the last iteration
254 %xdefine W_NO_TAIL_PRECALC 1 ;; no software pipelining for single block interface
269 UPDATE_HASH [HASH_PTR ],A
270 UPDATE_HASH [HASH_PTR+ 4],B
271 UPDATE_HASH [HASH_PTR+ 8],C
272 UPDATE_HASH [HASH_PTR+12],D
273 UPDATE_HASH [HASH_PTR+16],E
283 %xdefine W_NO_TAIL_PRECALC 0
332 %if (i<16 || (i>=80 && i<(80 + W_PRECALC_AHEAD)))
334 %if (W_NO_TAIL_PRECALC == 0)
336 %xdefine i ((%1) % 80) ;; pre-compute for the next iteration
348 %elif (i < 80) ;; rounds 32-79
353 %macro W_PRECALC_RESET 0
355 %xdefine W_minus_04 W4
356 %xdefine W_minus_08 W8
357 %xdefine W_minus_12 W12
358 %xdefine W_minus_16 W16
359 %xdefine W_minus_20 W20
360 %xdefine W_minus_24 W24
361 %xdefine W_minus_28 W28
362 %xdefine W_minus_32 W
365 %macro W_PRECALC_ROTATE 0
366 %xdefine W_minus_32 W_minus_28
367 %xdefine W_minus_28 W_minus_24
368 %xdefine W_minus_24 W_minus_20
369 %xdefine W_minus_20 W_minus_16
370 %xdefine W_minus_16 W_minus_12
371 %xdefine W_minus_12 W_minus_08
372 %xdefine W_minus_08 W_minus_04
373 %xdefine W_minus_04 W
374 %xdefine W W_minus_32
377 %xdefine W_PRECALC_AHEAD 16 ;; RR calls W_PRECALC with (its 6th argument + W_PRECALC_AHEAD): message scheduling runs this far ahead (presumably counted in rounds)
378 %xdefine W_NO_TAIL_PRECALC 0 ;; 0 = W_PRECALC may pre-compute for the next iteration; redefined to 1 for the single block interface
381 %xdefine xmm_mov movdqa ;; instruction behind every xmm_mov use (XMM register save/restore, byte-swap shuffle mask load)
383 %macro W_PRECALC_00_15 0
384 ;; message scheduling pre-compute for rounds 0-15
385 %if ((i & 3) == 0) ;; blended SSE and ALU instruction scheduling, 1 vector iteration per 4 rounds
386 movdqu W_TMP, [BUFFER_PTR + (i * 4)]
388 pshufb W_TMP, XMM_SHUFB_BSWAP
391 paddd W_TMP, [K_BASE]
393 movdqa [WK(i&~3)], W_TMP
399 %macro W_PRECALC_16_31 0
400 ;; message scheduling pre-compute for rounds 16-31
401 ;; calculating last 32 w[i] values in 8 XMM registers
402 ;; pre-calculate K+w[i] values and store to mem, for later load by ALU add instruction
404 ;; "brute force" vectorization for rounds 16-31 only due to w[i]->w[i-3] dependency
406 %if ((i & 3) == 0) ;; blended SSE and ALU instruction scheduling, 1 vector iteration per 4 rounds
408 palignr W, W_minus_16, 8 ;; w[i-14]
409 movdqa W_TMP, W_minus_04
410 psrldq W_TMP, 4 ;; w[i-3]
413 pxor W_TMP, W_minus_16
429 paddd W_TMP, [K_BASE + K_XMM]
430 movdqa [WK(i&~3)],W_TMP
436 %macro W_PRECALC_32_79 0
437 ;; in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
438 ;; instead we do equal: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
439 ;; allows more efficient vectorization since w[i]=>w[i-3] dependency is broken
441 %if ((i & 3) == 0) ;; blended SSE and ALU instruction scheduling, 1 vector iteration per 4 rounds
442 movdqa W_TMP, W_minus_04
443 pxor W, W_minus_28 ;; W is W_minus_32 before xor
444 palignr W_TMP, W_minus_08, 8
455 paddd W_TMP, [K_BASE + K_XMM]
456 movdqa [WK(i&~3)],W_TMP
462 %macro RR 6 ;; RR does two rounds of SHA-1 back to back with W pre-calculation
465 ;; A = F( i, B, C, D ) + E + ROTATE_LEFT( A, 5 ) + W[i] + K(i)
466 ;; C = ROTATE_LEFT( B, 30 )
471 W_PRECALC (%6 + W_PRECALC_AHEAD)
472 F %2, %3, %4 ;; F returns result in T1
480 W_PRECALC (%6 + W_PRECALC_AHEAD + 1)
485 F %1, %2, %3 ;; F returns result in T1
490 ;; rotate: %1<=%4, %2<=%5, %3<=%1, %4<=%2, %5<=%3
495 ;;----------------------
499 %xdefine K1 0x5a827999 ;; SHA-1 round constant, rounds 0-19 (FIPS 180-4)
500 %xdefine K2 0x6ed9eba1 ;; SHA-1 round constant, rounds 20-39
501 %xdefine K3 0x8f1bbcdc ;; SHA-1 round constant, rounds 40-59
502 %xdefine K4 0xca62c1d6 ;; SHA-1 round constant, rounds 60-79
518 ;; dispatch pointer, points to the init routine for the first invocation
519 sha1_update_intel_dispatched:
520 DQ sha1_update_intel_init_
522 ;;----------------------
526 SHA1_VECTOR_ASM sha1_update_intel_ssse3_, multiblock
529 sha1_update_intel_init_: ;; reached on the first invocation only: the dispatch pointer initially holds this address
530 call sha1_update_intel_dispacth_init_ ;; stores the selected implementation's address into the dispatch pointer
531 INTEL_SHA1_UPDATE_FUNCNAME: ;; exported entry point; the first call also falls through to here after init
532 jmp qword [sha1_update_intel_dispatched] ;; jump (not call) to whatever the dispatch pointer currently holds
534 ;; CPUID feature flag based dispatch
535 sha1_update_intel_dispacth_init_:
542 lea rsi, [INTEL_SHA1_UPDATE_DEFAULT_DISPATCH]
547 test ecx, 0200h ;; SSSE3 support, CPUID.1.ECX[bit 9]
550 lea rsi, [sha1_update_intel_ssse3_]
553 mov [sha1_update_intel_dispatched], rsi
562 ;;----------------------
563 ;; in the case a default SHA-1 update function implementation was not provided
564 ;; and code was invoked on a non-SSSE3 supporting CPU, dispatch handles this
565 ;; failure in the safest way - jumps to the stub function with UD2 instruction below
566 sha1_intel_non_ssse3_cpu_stub_:
567 ud2 ;; in the case no default SHA-1 was provided non-SSSE3 CPUs safely fail here
571 ;----------------------