wimlib.net Git - wimlib/blob - src/sha1-ssse3.asm

   1 ;---------------------
   2 ;
   3 ;   This code implements two interfaces of the SHA-1 update function: 1) working on a single
   4 ;   64-byte block and 2) working on a buffer of multiple 64-byte blocks. The multiple-block
   5 ;   version of the code is software pipelined and faster overall; it is the default. Assemble
   6 ;   with -DINTEL_SHA1_SINGLEBLOCK to select the single 64-byte block function interface.
   7 ;
   8 ;   C++ prototypes of implemented functions are below:
   9 ;
  10 ;   #ifndef INTEL_SHA1_SINGLEBLOCK
  11 ;      // Updates 20-byte SHA-1 record in 'hash' for 'num_blocks' consecutive 64-byte blocks
  12 ;      extern "C" void sha1_update_intel(int *hash, const char* input, size_t num_blocks );
  13 ;   #else
  14 ;      // Updates 20-byte SHA-1 record in 'hash' for one 64-byte block pointed by 'input'
  15 ;      extern "C" void sha1_update_intel(int *hash, const char* input);
  16 ;   #endif
  17 ;
  18 ;   Function name 'sha1_update_intel' can be changed in the source or via macro:
  19 ;     -DINTEL_SHA1_UPDATE_FUNCNAME=my_sha1_update_func_name
  20 ;
  21 ;   It implements both UNIX(default) and Windows ABIs, use -DWIN_ABI on Windows
  22 ;
  23 ;   The code checks the CPU for SSSE3 support via the CPUID feature flag (CPUID.1.ECX.SSSE3[bit 9]==1)
  24 ;   and dispatches accordingly. Since support for non-SSSE3 CPUs is also required in most cases,
  25 ;   a default function (i.e. the one being replaced) can be provided for dispatch on such CPUs;
  26 ;   the name of that function can be set in the source or via macro:
  27 ;      -DINTEL_SHA1_UPDATE_DEFAULT_DISPATCH=default_sha1_update_function_name
  28 ;
  29 ;   Authors: Maxim Locktyukhin and Ronen Zohar at Intel.com
  30 ;
  31
  32 %ifndef INTEL_SHA1_UPDATE_DEFAULT_DISPATCH
  33 ;; no fallback was named on the command line: non-SSSE3 CPUs are dispatched to the UD2 stub defined in this file
  34 %define INTEL_SHA1_UPDATE_DEFAULT_DISPATCH  sha1_intel_non_ssse3_cpu_stub_
  35 %else
  36 extern  INTEL_SHA1_UPDATE_DEFAULT_DISPATCH    ;; a fallback was named; it is defined outside this file
  37 %endif
  38
  39 ;; name of the exported entry point; override with -DINTEL_SHA1_UPDATE_FUNCNAME=<name>
  40 %ifndef INTEL_SHA1_UPDATE_FUNCNAME
  41 %define INTEL_SHA1_UPDATE_FUNCNAME     sha1_update_intel
  42 %endif
  43
  44 global INTEL_SHA1_UPDATE_FUNCNAME    ;; export the entry point under the selected name
  45
  46
  47 %ifndef INTEL_SHA1_SINGLEBLOCK
  48 %assign multiblock 1
  49 %else
  50 %assign multiblock 0
  51 %endif
  52 ;; multiblock: 1 = multi-block interface (default), 0 = single-block; passed to SHA1_VECTOR_ASM as its 2nd parameter
  53
  54 bits 64          ;; assemble as 64-bit code
  55 default rel      ;; memory operands without a base register are assembled RIP-relative
  56
  57 %ifdef WIN_ABI       ;; Microsoft x64 ABI: first three integer arguments arrive in rcx, rdx, r8
  58 %xdefine arg1 rcx
  59 %xdefine arg2 rdx
  60 %xdefine arg3 r8
  61 %else
  62 %xdefine arg1 rdi    ;; System V AMD64 ABI: first three integer arguments arrive in rdi, rsi, rdx
  63 %xdefine arg2 rsi
  64 %xdefine arg3 rdx
  65 %endif
  66 ;; names follow the C prototype: ctx = 'hash', buf = 'input', cnt = 'num_blocks' (multi-block interface only)
  67 %xdefine ctx arg1
  68 %xdefine buf arg2
  69 %xdefine cnt arg3
  70
  71 %macro REGALLOC 0    ;; binds the five SHA-1 working variables and two temporaries to 32-bit registers
  72 %xdefine A ecx       ;; NOTE: ecx/esi/edi/edx overlap the ABI argument registers, so ctx/buf/cnt
  73 %xdefine B esi       ;;       must already have been copied elsewhere before A..E are loaded
  74 %xdefine C edi
  75 %xdefine D ebp       ;; ebp (and ebx below) are callee-saved; SHA1_VECTOR_ASM pushes/pops rbp and rbx
  76 %xdefine E edx
  77 ;; temporaries: T1 receives the result of the round function F, T2 is scratch for RR and F3
  78 %xdefine T1 eax
  79 %xdefine T2 ebx
  80 %endmacro
  81
  82 %xdefine K_BASE     r8     ;; address of K_XMM_AR; also r8 = cnt under WIN_ABI, so cnt must be consumed before K_BASE is loaded
  83 %xdefine HASH_PTR   r9     ;; copy of ctx (pointer to the five 32-bit hash words)
  84 %xdefine BUFFER_PTR r10    ;; pointer to the input block being pre-calculated
  85 %xdefine BUFFER_END r11    ;; address just past the last input block (multi-block interface only)
  86
  87 %xdefine W_TMP  xmm0       ;; scratch vectors for the message schedule
  88 %xdefine W_TMP2 xmm9
  89 ;; eight vectors, four message-schedule words each; renamed on the fly by W_PRECALC_RESET / W_PRECALC_ROTATE
  90 %xdefine W0  xmm1
  91 %xdefine W4  xmm2
  92 %xdefine W8  xmm3
  93 %xdefine W12 xmm4
  94 %xdefine W16 xmm5
  95 %xdefine W20 xmm6
  96 %xdefine W24 xmm7
  97 %xdefine W28 xmm8
  98
  99 %xdefine XMM_SHUFB_BSWAP xmm10    ;; pshufb control mask loaded from bswap_shufb_ctl
 100
 101 ;; circular buffer of 16 pre-calculated w[i]+K dwords (64 bytes) at the bottom of the stack frame, indexed by round number mod 16
 102 %xdefine WK(t) (rsp + (t & 15)*4)
 103
 104 ;------------------------------------------------------------------------------
 105 ;
 106 ; macro emits a complete SHA-1 update function (prologue, main body, epilogue) for single or several 64-byte blocks
 107 ; first param: function's name (emitted as the entry label)
 108 ; second param: =0 - function implements single 64-byte block hash
 109 ;               =1 - function implements multiple 64-byte blocks hash
 110 ;                    3rd function's argument is a number, greater 0, of 64-byte blocks to calc hash for
 111 ;
 112 %macro  SHA1_VECTOR_ASM  2
 113 align 4096
 114 %1:
 115 push rbx                                ;; callee-saved; ebx is used as T2
 116 push rbp                                ;; callee-saved; ebp is used as D
 117
 118 %ifdef WIN_ABI
 119 push rdi                                ;; rdi/rsi are callee-saved in the Microsoft x64 ABI and are used as C/B
 120 push rsi
 121
 122 %xdefine stack_size (16*4 + 16*5 + 8)   ;; 64-byte WK buffer + five 16-byte XMM save slots + 8 bytes of padding
 123 %else
 124 %xdefine stack_size (16*4 + 8)          ;; 64-byte WK buffer + 8 bytes of padding
 125 %endif
 126 ;; assuming the ABI entry alignment (rsp % 16 == 8), the 8 padding bytes leave rsp 16-byte aligned for the movdqa accesses below
 127 sub     rsp, stack_size
 128
 129 %ifdef WIN_ABI
 130 %xdefine xmm_save_base (rsp + 16*4)     ;; XMM save area sits directly above the WK buffer
 131 ;; xmm6-xmm10 are callee-saved in the Microsoft x64 ABI and are used here as W20, W24, W28, W_TMP2, XMM_SHUFB_BSWAP
 132 xmm_mov [xmm_save_base + 0*16], xmm6
 133 xmm_mov [xmm_save_base + 1*16], xmm7
 134 xmm_mov [xmm_save_base + 2*16], xmm8
 135 xmm_mov [xmm_save_base + 3*16], xmm9
 136 xmm_mov [xmm_save_base + 4*16], xmm10
 137 %endif
 138
 139 mov     HASH_PTR, ctx                   ;; move the arguments out of registers that REGALLOC reuses for A..E
 140 mov     BUFFER_PTR, buf
 141
 142 %if (%2 == 1)
 143 shl     cnt, 6           ;; cnt = num_blocks * 64 (byte length of the input)
 144 add     cnt, buf         ;; cnt = address just past the last block
 145 mov     BUFFER_END, cnt
 146 %endif
 147
 148 lea     K_BASE, [K_XMM_AR]              ;; loaded only after cnt is consumed: under WIN_ABI both are r8
 149 xmm_mov XMM_SHUFB_BSWAP, [bswap_shufb_ctl]
 150
 151 SHA1_PIPELINED_MAIN_BODY %2
 152
 153 %ifdef WIN_ABI
 154 xmm_mov xmm6, [xmm_save_base + 0*16]    ;; restore the callee-saved XMM registers
 155 xmm_mov xmm7, [xmm_save_base + 1*16]
 156 xmm_mov xmm8, [xmm_save_base + 2*16]
 157 xmm_mov xmm9, [xmm_save_base + 3*16]
 158 xmm_mov xmm10,[xmm_save_base + 4*16]
 159 %endif
 160
 161 add rsp, stack_size
 162
 163 %ifdef WIN_ABI
 164 pop rsi                                 ;; pops mirror the prologue pushes in reverse order
 165 pop rdi
 166 %endif
 167
 168 pop rbp
 169 pop rbx
 170
 171 ret
 172 %endmacro
 173
 174 ;--------------------------------------------
 175 ; macro implements 80 rounds of SHA-1, for one 64-byte block or multiple blocks with s/w pipelining
 176 ; macro param: =0 - process single 64-byte block
 177 ;              =1 - multiple blocks
 178 ; on entry: HASH_PTR, BUFFER_PTR, K_BASE, XMM_SHUFB_BSWAP (and BUFFER_END when param is 1) are set; WK() area is at rsp
 179 %macro SHA1_PIPELINED_MAIN_BODY 1
 180
 181 REGALLOC
 182 ;; load the current hash state into the working variables
 183 mov A, [HASH_PTR   ]
 184 mov B, [HASH_PTR+ 4]
 185 mov C, [HASH_PTR+ 8]
 186 mov D, [HASH_PTR+12]
 187
 188 mov E, [HASH_PTR+16]
 189 ;; pipeline prologue: pre-calculate w[i]+K for the first W_PRECALC_AHEAD rounds of the first block
 190 %assign i 0
 191 %rep    W_PRECALC_AHEAD
 192 W_PRECALC i
 193 %assign i i+1
 194 %endrep
 195
 196 %xdefine F F1                         ;; round function for rounds 0-19
 197
 198 %if (%1 == 1)                         ;; code loops through more than one block
 199 %%_loop:
 200 cmp BUFFER_PTR, K_BASE          ;; we use K_BASE value as a signal of a last block,
 201 jne %%_begin                    ;; it is set below by: cmovae BUFFER_PTR, K_BASE
 202 jmp %%_end
 203
 204 align 32
 205 %%_begin:
 206 %endif
 207 RR A,B,C,D,E,0                        ;; each RR performs two rounds; the register roles rotate from call to call
 208 RR D,E,A,B,C,2
 209 RR B,C,D,E,A,4
 210 RR E,A,B,C,D,6
 211 RR C,D,E,A,B,8
 212
 213 RR A,B,C,D,E,10
 214 RR D,E,A,B,C,12
 215 RR B,C,D,E,A,14
 216 RR E,A,B,C,D,16
 217 RR C,D,E,A,B,18
 218
 219 %xdefine F F2                         ;; round function for rounds 20-39
 220
 221 RR A,B,C,D,E,20
 222 RR D,E,A,B,C,22
 223 RR B,C,D,E,A,24
 224 RR E,A,B,C,D,26
 225 RR C,D,E,A,B,28
 226
 227 RR A,B,C,D,E,30
 228 RR D,E,A,B,C,32
 229 RR B,C,D,E,A,34
 230 RR E,A,B,C,D,36
 231 RR C,D,E,A,B,38
 232
 233 %xdefine F F3                         ;; round function for rounds 40-59
 234
 235 RR A,B,C,D,E,40
 236 RR D,E,A,B,C,42
 237 RR B,C,D,E,A,44
 238 RR E,A,B,C,D,46
 239 RR C,D,E,A,B,48
 240
 241 RR A,B,C,D,E,50
 242 RR D,E,A,B,C,52
 243 RR B,C,D,E,A,54
 244 RR E,A,B,C,D,56
 245 RR C,D,E,A,B,58
 246
 247 %xdefine F F4                         ;; round function for rounds 60-79
 248
 249 %if (%1 == 1)                         ;; if code loops through more than one block
 250 add   BUFFER_PTR, 64            ;; move to next 64-byte block
 251 cmp   BUFFER_PTR, BUFFER_END    ;; check if current block is the last one
 252 cmovae BUFFER_PTR, K_BASE       ;; smart way to signal the last iteration
 253 %else
 254 %xdefine W_NO_TAIL_PRECALC 1    ;; no software pipelining for single block interface
 255 %endif
 256 ;; from RR ...,64 onwards the look-ahead index reaches 80, so W_PRECALC starts loading the next block via BUFFER_PTR;
 257 RR A,B,C,D,E,60                       ;; on the last block BUFFER_PTR == K_BASE, so those loads read the 64-byte K table instead
 258 RR D,E,A,B,C,62
 259 RR B,C,D,E,A,64
 260 RR E,A,B,C,D,66
 261 RR C,D,E,A,B,68
 262
 263 RR A,B,C,D,E,70
 264 RR D,E,A,B,C,72
 265 RR B,C,D,E,A,74
 266 RR E,A,B,C,D,76
 267 RR C,D,E,A,B,78
 268 ;; add the working variables into the stored hash; the registers keep the sums as the next block's starting state
 269 UPDATE_HASH [HASH_PTR   ],A
 270 UPDATE_HASH [HASH_PTR+ 4],B
 271 UPDATE_HASH [HASH_PTR+ 8],C
 272 UPDATE_HASH [HASH_PTR+12],D
 273 UPDATE_HASH [HASH_PTR+16],E
 274
 275 %if (%1 == 1)
 276 jmp %%_loop                           ;; the test at %%_loop exits once BUFFER_PTR has been replaced by K_BASE
 277
 278 align 32
 279 %%_end:
 280 %endif
 281
 282
 283 %xdefine W_NO_TAIL_PRECALC 0          ;; restore the default for any later expansion
 284 %xdefine F %error                     ;; any use of F outside this macro now expands to %error
 285
 286 %endmacro
 287
 288
 289 %macro F1 3      ;; rounds 0-19: T1 = ((%2 ^ %3) & %1) ^ %3, i.e. bits of %2 where %1 is set, bits of %3 elsewhere
 290 mov T1,%2
 291 xor T1,%3
 292 and T1,%1
 293 xor T1,%3
 294 %endmacro
 295
 296 %macro F2 3      ;; rounds 20-39 (and 60-79 via F4): T1 = %1 ^ %2 ^ %3
 297 mov T1,%3
 298 xor T1,%2
 299 xor T1,%1
 300 %endmacro
 301
 302 %macro F3 3      ;; rounds 40-59: T1 = ((%1 | %2) & %3) | (%1 & %2), the bitwise majority of the three inputs; clobbers T2
 303 mov T1,%2
 304 mov T2,%1
 305 or  T1,%1        ;; T1 = %1 | %2
 306 and T2,%2        ;; T2 = %1 & %2
 307 and T1,%3
 308 or  T1,T2
 309 %endmacro
 310
 311 %define F4 F2    ;; rounds 60-79 reuse the three-way XOR of F2
 312
 313 %macro UPDATE_HASH 2    ;; %1 = hash word in memory, %2 = working register
 314 add %2, %1              ;; register += stored hash word
 315 mov %1, %2              ;; write the sum back; the register keeps it too
 316 %endmacro
 317
 318
 319 %macro W_PRECALC 1      ;; emits one slice of the message-schedule pre-calculation for round index %1
 320 %xdefine i (%1)
 321 ;; K_XMM = byte offset into K_XMM_AR of the constant vector for round i (one 16-byte vector per 20 rounds)
 322 %if (i < 20)
 323 %xdefine K_XMM  0
 324 %elif (i < 40)
 325 %xdefine K_XMM  16
 326 %elif (i < 60)
 327 %xdefine K_XMM  32
 328 %else
 329 %xdefine K_XMM  48
 330 %endif
 331 ;; indices 0-15 come from the pipeline prologue; indices 80..(80+W_PRECALC_AHEAD-1) are look-ahead into the next block
 332 %if (i<16 || (i>=80 && i<(80 + W_PRECALC_AHEAD)))
 333
 334 %if (W_NO_TAIL_PRECALC == 0)
 335
 336 %xdefine i ((%1) % 80)        ;; pre-compute for the next iteration
 337
 338 %if (i == 0)
 339 W_PRECALC_RESET
 340 %endif
 341
 342
 343 W_PRECALC_00_15
 344 %endif
 345
 346 %elif (i < 32)
 347 W_PRECALC_16_31
 348 %elif (i < 80)   ;; rounds 32-79
 349 W_PRECALC_32_79
 350 %endif
 351 %endmacro
 352
 353 %macro W_PRECALC_RESET 0              ;; restores the initial name-to-register mapping at the start of a block (i == 0)
 354 %xdefine    W             W0
 355 %xdefine    W_minus_04    W4
 356 %xdefine    W_minus_08    W8
 357 %xdefine    W_minus_12    W12
 358 %xdefine    W_minus_16    W16
 359 %xdefine    W_minus_20    W20
 360 %xdefine    W_minus_24    W24
 361 %xdefine    W_minus_28    W28
 362 %xdefine    W_minus_32    W           ;; W_minus_32 shares W's register: only eight vectors exist
 363 %endmacro
 364
 365 %macro W_PRECALC_ROTATE 0             ;; advances the naming by one 4-round step; %xdefine expands immediately, so the order below matters
 366 %xdefine    W_minus_32    W_minus_28
 367 %xdefine    W_minus_28    W_minus_24
 368 %xdefine    W_minus_24    W_minus_20
 369 %xdefine    W_minus_20    W_minus_16
 370 %xdefine    W_minus_16    W_minus_12
 371 %xdefine    W_minus_12    W_minus_08
 372 %xdefine    W_minus_08    W_minus_04
 373 %xdefine    W_minus_04    W           ;; the vector just produced becomes W_minus_04
 374 %xdefine    W             W_minus_32  ;; W now names the oldest register, which the next step overwrites
 375 %endmacro
 376
 377 %xdefine W_PRECALC_AHEAD   16
 378 %xdefine W_NO_TAIL_PRECALC 0
 379 ;; W_PRECALC_AHEAD: how many rounds ahead of the ALU rounds the w[i]+K pre-calculation runs
 380 ;; W_NO_TAIL_PRECALC: set to 1 to suppress the look-ahead into a following block (single-block interface)
 381 %xdefine xmm_mov            movdqa
 382
 383 %macro W_PRECALC_00_15 0
 384 ;; message scheduling pre-compute for rounds 0-15: one 4-word vector is handled in four slices selected by (i & 3)
 385 %if ((i & 3) == 0)       ;; blended SSE and ALU instruction scheduling, 1 vector iteration per 4 rounds
 386 movdqu W_TMP, [BUFFER_PTR + (i * 4)]    ;; unaligned load of four input words
 387 %elif ((i & 3) == 1)
 388 pshufb W_TMP, XMM_SHUFB_BSWAP           ;; reverse the byte order within each dword
 389 movdqa W, W_TMP                         ;; keep the plain w[i..i+3] for later schedule steps
 390 %elif ((i & 3) == 2)
 391 paddd  W_TMP, [K_BASE]                  ;; add the first constant vector (offset 0 of K_XMM_AR)
 392 %elif ((i & 3) == 3)
 393 movdqa  [WK(i&~3)], W_TMP               ;; store four w+K values for the ALU rounds
 394
 395 W_PRECALC_ROTATE
 396 %endif
 397 %endmacro
 398
 399 %macro W_PRECALC_16_31 0
 400 ;; message scheduling pre-compute for rounds 16-31
 401 ;; calculating last 32 w[i] values in 8 XMM registers
 402 ;; pre-calculate K+w[i] values and store to mem, for later load by ALU add instruction
 403 ;;
 404 ;; "brute force" vectorization for rounds 16-31 only due to w[i]->w[i-3] dependency
 405 ;;
 406 %if ((i & 3) == 0)    ;; blended SSE and ALU instruction scheduling, 1 vector iteration per 4 rounds
 407 movdqa  W, W_minus_12
 408 palignr W, W_minus_16, 8       ;; w[i-14]
 409 movdqa  W_TMP, W_minus_04
 410 psrldq  W_TMP, 4               ;; w[i-3]
 411 pxor    W, W_minus_08          ;; ^ w[i-8]
 412 %elif ((i & 3) == 1)
 413 pxor    W_TMP, W_minus_16      ;; ^ w[i-16]
 414 pxor    W, W_TMP               ;; W = un-rotated XOR of the four terms; top lane lacks its w[i-3] term (shifted in as zero)
 415 movdqa  W_TMP2, W
 416 movdqa  W_TMP, W
 417 pslldq  W_TMP2, 12             ;; move lane 0 into the top lane, zeroing the other lanes
 418 %elif ((i & 3) == 2)
 419 psrld   W, 31
 420 pslld   W_TMP, 1
 421 por     W_TMP, W               ;; W_TMP = each dword rotated left by 1
 422 movdqa  W, W_TMP2
 423 psrld   W_TMP2, 30
 424 pslld   W, 2                   ;; W and W_TMP2 together hold the top-lane fix-up term rotated left by 2
 425 %elif ((i & 3) == 3)
 426 pxor    W_TMP, W
 427 pxor    W_TMP, W_TMP2          ;; apply the top-lane fix-up
 428 movdqa  W, W_TMP               ;; keep the plain w[i..i+3]
 429 paddd   W_TMP, [K_BASE + K_XMM]
 430 movdqa  [WK(i&~3)],W_TMP       ;; store four w+K values for the ALU rounds
 431
 432 W_PRECALC_ROTATE
 433 %endif
 434 %endmacro
 435
 436 %macro W_PRECALC_32_79 0
 437 ;; in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
 438 ;; instead we do equal:    w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 439 ;; allows more efficient vectorization since w[i]=>w[i-3] dependency is broken
 440 ;;
 441 %if ((i & 3) == 0)    ;; blended SSE and ALU instruction scheduling, 1 vector iteration per 4 rounds
 442 movdqa  W_TMP, W_minus_04
 443 pxor    W, W_minus_28         ;; W is W_minus_32 before xor
 444 palignr W_TMP, W_minus_08, 8  ;; W_TMP = w[i-6]
 445 %elif ((i & 3) == 1)
 446 pxor    W, W_minus_16
 447 pxor    W, W_TMP              ;; W = un-rotated XOR of the four terms
 448 movdqa  W_TMP, W
 449 %elif ((i & 3) == 2)
 450 psrld   W, 30
 451 pslld   W_TMP, 2
 452 por     W_TMP, W              ;; W_TMP = each dword rotated left by 2
 453 %elif ((i & 3) == 3)
 454 movdqa  W, W_TMP              ;; keep the plain w[i..i+3]
 455 paddd   W_TMP, [K_BASE + K_XMM]
 456 movdqa  [WK(i&~3)],W_TMP      ;; store four w+K values for the ALU rounds
 457
 458 W_PRECALC_ROTATE
 459 %endif
 460 %endmacro
 461
 462 %macro RR 6             ;; RR does two rounds of SHA-1 back to back with W pre-calculation
 463 ;; params: %1..%5 = registers currently holding a,b,c,d,e; %6 = index of the first of the two rounds; clobbers T1, T2
 464 ;;     TEMP = A
 465 ;;     A = F( i, B, C, D ) + E + ROTATE_LEFT( A, 5 ) + W[i] + K(i)
 466 ;;     C = ROTATE_LEFT( B, 30 )
 467 ;;     D = C
 468 ;;     E = D
 469 ;;     B = TEMP
 470 ;; first round (index %6), interleaved with a pre-calculation slice W_PRECALC_AHEAD rounds ahead
 471 W_PRECALC (%6 + W_PRECALC_AHEAD)
 472 F    %2, %3, %4     ;; F returns result in T1
 473 add  %5, [WK(%6)]          ;; e += w[i] + K
 474 rol  %2, 30                ;; b = rol(b, 30)
 475 mov  T2, %1
 476 add  %4, [WK(%6 + 1)]      ;; d += w[i+1] + K, issued early for the second round
 477 rol  T2, 5                 ;; T2 = rol(a, 5)
 478 add  %5, T1                ;; e += F(b, c, d)
 479 ;; second round (index %6 + 1); %5 receives the first round's result and acts as 'a' here
 480 W_PRECALC (%6 + W_PRECALC_AHEAD + 1)
 481 add  T2, %5
 482 mov  %5, T2                ;; %5 = result of the first round
 483 rol  T2, 5
 484 add  %4, T2                ;; d += rol(%5, 5)
 485 F    %1, %2, %3    ;; F returns result in T1
 486 add  %4, T1                ;; %4 = result of the second round
 487 rol  %1, 30
 488
 489 ;; write:  %1, %2
 490 ;; rotate: %1<=%4, %2<=%5, %3<=%1, %4<=%2, %5<=%3
 491 %endmacro
 492
 493
 494
 495 ;;----------------------
 496 section .data
 497 align 128
 498 ;; the four SHA-1 round constants; W_PRECALC selects one per group of 20 rounds
 499 %xdefine K1 0x5a827999
 500 %xdefine K2 0x6ed9eba1
 501 %xdefine K3 0x8f1bbcdc
 502 %xdefine K4 0xca62c1d6
 503
 504 align 128
 505 K_XMM_AR:                  ;; each constant replicated into all four dword lanes; K_XMM offsets 0/16/32/48 index these rows
 506 DD K1, K1, K1, K1
 507 DD K2, K2, K2, K2
 508 DD K3, K3, K3, K3
 509 DD K4, K4, K4, K4
 510
 511 align 16
 512 bswap_shufb_ctl:           ;; pshufb control: reverses the byte order within each of the four dwords
 513 DD 00010203h
 514 DD 04050607h
 515 DD 08090a0bh
 516 DD 0c0d0e0fh
 517
 518 ;; dispatch pointer: initially the init routine; the dispatch init code overwrites it with the selected implementation
 519 sha1_update_intel_dispatched:
 520 DQ  sha1_update_intel_init_
 521
 522 ;;----------------------
 523 section .text
 524 align 4096
 525 ;; instantiate the SSSE3 implementation; 'multiblock' selects the multi-block (1) or single-block (0) interface
 526 SHA1_VECTOR_ASM     sha1_update_intel_ssse3_, multiblock
 527
 528 align 32
 529 sha1_update_intel_init_:       ;; reached through the dispatch pointer until the first invocation has resolved it
 530 call    sha1_update_intel_dispatch_init_    ;; resolves the pointer; preserves all general-purpose registers
 531 INTEL_SHA1_UPDATE_FUNCNAME:    ;; exported entry point; the init path above falls through to here
 532 jmp     qword [sha1_update_intel_dispatched]    ;; tail-jump with the caller's arguments untouched
 533
 534 ;; CPUID feature flag based dispatch: stores the implementation to use in sha1_update_intel_dispatched
 535 sha1_update_intel_dispatch_init_:
 536 push    rax                 ;; cpuid overwrites eax, ebx, ecx, edx; rsi is used as scratch;
 537 push    rbx                 ;; all five are saved because some of them may hold the caller's arguments
 538 push    rcx
 539 push    rdx
 540 push    rsi
 541
 542 lea     rsi, [INTEL_SHA1_UPDATE_DEFAULT_DISPATCH]    ;; default choice: the non-SSSE3 fallback
 543
 544 mov     eax, 1              ;; CPUID leaf 1: feature flags
 545 cpuid
 546
 547 test    ecx, 0200h          ;; SSSE3 support, CPUID.1.ECX[bit 9]
 548 jz      .done               ;; no SSSE3: keep the fallback
 549
 550 lea     rsi, [sha1_update_intel_ssse3_]
 551
 552 .done:
 553 mov     [sha1_update_intel_dispatched], rsi
 554
 555 pop     rsi
 556 pop     rdx
 557 pop     rcx
 558 pop     rbx
 559 pop     rax
 560 ret
 561
 562 ;;----------------------
 563 ;; default dispatch target when no fallback SHA-1 update function was provided
 564 ;; (INTEL_SHA1_UPDATE_DEFAULT_DISPATCH left undefined): if the code is invoked on a
 565 ;; CPU without SSSE3, the dispatcher jumps here and the call fails on the UD2 below
 566 sha1_intel_non_ssse3_cpu_stub_:
 567 ud2     ;; raises an invalid-opcode exception instead of silently producing a wrong hash
 568 ret     ;; not expected to execute after UD2
 569
 570 ; END
 571 ;----------------------