# skein_block_asm.s revision 312557
#
#----------------------------------------------------------------
# 64-bit x86 assembler code (gnu as) for Skein block functions
#
# Author: Doug Whiting, Hifn/Exar
#
# This code is released to the public domain.
#----------------------------------------------------------------
#
    .text
    .altmacro
    .psize 0,128                            #list file has no page boundaries
#
_MASK_ALL_  = (256+512+1024)                #all three algorithm bits
_MAX_FRAME_ = 240
#
#################
# _USE_ASM_ is a bit mask (256/512/1024) of the block sizes assembled below;
# it defaults to all three unless SKEIN_USE_ASM is defined by the build.
.ifndef SKEIN_USE_ASM
_USE_ASM_         = _MASK_ALL_
.else
_USE_ASM_         = SKEIN_USE_ASM
.endif
#################
# SKEIN_LOOP packs one decimal digit per block size:
# hundreds = 256, tens = 512, ones = 1024 (decoded into SKEIN_UNROLL_* below).
.ifndef SKEIN_LOOP                          #configure loop unrolling
_SKEIN_LOOP       =   2                     #default is fully unrolled for 256/512, twice for 1024
.else
_SKEIN_LOOP       = SKEIN_LOOP
  .irp _NN_,%_SKEIN_LOOP                    #only display loop unrolling if default changed on command line
.print  "+++ SKEIN_LOOP = \_NN_"
  .endr
.endif
# the unroll counts (0 --> fully unrolled)
SKEIN_UNROLL_256  = (_SKEIN_LOOP / 100) % 10
SKEIN_UNROLL_512  = (_SKEIN_LOOP /  10) % 10
SKEIN_UNROLL_1024 = (_SKEIN_LOOP      ) % 10
#
# SKEIN_ASM_UNROLL = bit mask of the block sizes whose unroll count is 0,
# i.e. the sizes that are assembled fully unrolled.
SKEIN_ASM_UNROLL  = 0
  .irp _NN_,256,512,1024
    .if (SKEIN_UNROLL_\_NN_) == 0
SKEIN_ASM_UNROLL  = SKEIN_ASM_UNROLL + \_NN_
    .endif
  .endr
#################
#
# Round counts.  Defaults are 72/72/80.  When SKEIN_ROUNDS is defined it is
# decoded one decimal digit d per block size (hundreds = 256, tens = 512,
# ones = 1024) as 8*(((d+5) % 10) + 5) rounds.
.ifndef SKEIN_ROUNDS
ROUNDS_256  =   72
ROUNDS_512  =   72
ROUNDS_1024 =   80
.else
ROUNDS_256  = 8*((((SKEIN_ROUNDS / 100) + 5) % 10) + 5)
ROUNDS_512  = 8*((((SKEIN_ROUNDS /  10) + 5) % 10) + 5)
ROUNDS_1024 = 8*((((SKEIN_ROUNDS      ) + 5) % 10) + 5)
# only display rounds if default size is changed on command line
.irp _NN_,256,512,1024
  .if _USE_ASM_ && \_NN_
    .irp _RR_,%(ROUNDS_\_NN_)
      .if _NN_ < 1024
.print  "+++ SKEIN_ROUNDS_\_NN_ = \_RR_"
      .else
.print  "+++ SKEIN_ROUNDS_\_NN_ = \_RR_"
      .endif
    .endr
  .endif
.endr
.endif
#################
#
# _SKEIN_CODE_SIZE = 1 when either SKEIN_CODE_SIZE or SKEIN_PERF is defined;
# it gates the *_CodeSize and *_Unroll_Cnt helper entry points further down.
.ifdef SKEIN_CODE_SIZE
_SKEIN_CODE_SIZE = (1)
.else
.ifdef  SKEIN_PERF                          #use code size if SKEIN_PERF is defined
_SKEIN_CODE_SIZE = (1)
.else
_SKEIN_CODE_SIZE = (0)
.endif
.endif
#
#################
#
# _SKEIN_DEBUG = 1 enables the Skein_Debug_Block / Skein_Debug_Round calls
.ifndef SKEIN_DEBUG
_SKEIN_DEBUG      = 0
.else
_SKEIN_DEBUG      = 1
.endif
#################
#
# define offsets of fields in hash context structure
#
HASH_BITS   =   0                   #bits of hash output
BCNT        =   8 + HASH_BITS       #number of bytes in BUFFER[]
TWEAK       =   8 + BCNT            #tweak values[0..1]
X_VARS      =  16 + TWEAK           #chaining vars
#
#(Note: buffer[] in context structure is NOT needed here :-)
#
KW_PARITY   =   0x1BD11BDAA9FC1A22  #overall parity of key schedule words
FIRST_MASK  =   ~ (1 <<  6)
FIRST_MASK64=   ~ (1 << 62)         #bit 62 clear; the block functions AND this with tweak word T[1]
#
# rotation constants for Skein
#   naming: RC_<block bits>_<round number mod 8>_<mix number>
#
RC_256_0_0  = 14
RC_256_0_1  = 16

RC_256_1_0  = 52
RC_256_1_1  = 57

RC_256_2_0  = 23
RC_256_2_1  = 40

RC_256_3_0  =  5
RC_256_3_1  = 37

RC_256_4_0  = 25
RC_256_4_1  = 33

RC_256_5_0  = 46
RC_256_5_1  = 12

RC_256_6_0  = 58
RC_256_6_1  = 22

RC_256_7_0  = 32
RC_256_7_1  = 32

RC_512_0_0  = 46
RC_512_0_1  = 36
RC_512_0_2  = 19
RC_512_0_3  = 37

RC_512_1_0  = 33
RC_512_1_1  = 27
RC_512_1_2  = 14
RC_512_1_3  = 42

RC_512_2_0  = 17
RC_512_2_1  = 49
RC_512_2_2  = 36
RC_512_2_3  = 39

RC_512_3_0  = 44
RC_512_3_1  =  9
RC_512_3_2  = 54
RC_512_3_3  = 56

RC_512_4_0  = 39
RC_512_4_1  = 30
RC_512_4_2  = 34
RC_512_4_3  = 24

RC_512_5_0  = 13
RC_512_5_1  = 50
RC_512_5_2  = 10
RC_512_5_3  = 17

RC_512_6_0  = 25
RC_512_6_1  = 29
RC_512_6_2  = 39
RC_512_6_3  = 43

RC_512_7_0  =  8
RC_512_7_1  = 35
RC_512_7_2  = 56
RC_512_7_3  = 22

RC_1024_0_0 = 24
RC_1024_0_1 = 13
RC_1024_0_2 =  8
RC_1024_0_3 = 47
RC_1024_0_4 =  8
RC_1024_0_5 = 17
RC_1024_0_6 = 22
RC_1024_0_7 = 37

RC_1024_1_0 = 38
RC_1024_1_1 = 19
RC_1024_1_2 = 10
RC_1024_1_3 = 55
RC_1024_1_4 = 49
RC_1024_1_5 = 18
RC_1024_1_6 = 23
RC_1024_1_7 = 52

RC_1024_2_0 = 33
RC_1024_2_1 =  4
RC_1024_2_2 = 51
RC_1024_2_3 = 13
RC_1024_2_4 = 34
RC_1024_2_5 = 41
RC_1024_2_6 = 59
RC_1024_2_7 = 17

RC_1024_3_0 =  5
RC_1024_3_1 = 20
RC_1024_3_2 = 48
RC_1024_3_3 = 41
RC_1024_3_4 = 47
RC_1024_3_5 = 28
RC_1024_3_6 = 16
RC_1024_3_7 = 25

RC_1024_4_0 = 41
RC_1024_4_1 =  9
RC_1024_4_2 = 37
RC_1024_4_3 = 31
RC_1024_4_4 = 12
RC_1024_4_5 = 47
RC_1024_4_6 = 44
RC_1024_4_7 = 30

RC_1024_5_0 = 16
RC_1024_5_1 = 34
RC_1024_5_2 = 56
RC_1024_5_3 = 51
RC_1024_5_4 =  4
RC_1024_5_5 = 53
RC_1024_5_6 = 42
RC_1024_5_7 = 41

RC_1024_6_0 = 31
RC_1024_6_1 = 44
RC_1024_6_2 = 47
RC_1024_6_3 = 46
RC_1024_6_4 = 19
RC_1024_6_5 = 42
RC_1024_6_6 = 44
RC_1024_6_7 = 25

RC_1024_7_0 =  9
RC_1024_7_1 = 48
RC_1024_7_2 = 35
RC_1024_7_3 = 52
RC_1024_7_4 = 23
RC_1024_7_5 = 31
RC_1024_7_6 = 37
RC_1024_7_7 = 20
#
# MACRO: rotate a 64-bit register left by the tabulated constant
#  Input:  reg (register name without the percent sign)
#  Output: <reg> <<< RC_BlkSize_roundNum_mixNum, BlkSize=256/512/1024
#  No instruction is emitted when the selected constant is 0.
#
.macro RotL64   reg,BLK_SIZE,ROUND_NUM,MIX_NUM
_RCNT_ = RC_\BLK_SIZE&_\ROUND_NUM&_\MIX_NUM
  .if _RCNT_                                #is there anything to do?
# (RotL64 macro body, continued)
    rolq    $_RCNT_,%\reg
  .endif
.endm
#
#----------------------------------------------------------------
#
# MACROS: define local vars and configure stack
#
#----------------------------------------------------------------
# declare allocated space on the stack:
#   localName becomes the current running offset _STK_OFFS_, which is then
#   advanced by localSize bytes
.macro StackVar localName,localSize
\localName  = _STK_OFFS_
_STK_OFFS_  = _STK_OFFS_+(\localSize)
.endm #StackVar
#
#----------------------------------------------------------------
#
# MACRO: Configure stack frame, allocate local vars
#
#   BLK_BITS  block size in bits (WCNT = BLK_BITS/64 words)
#   KS_CNT    sizes the ksRot area (16 bytes per count); that area is only
#             allocated when this block size is not fully unrolled
#   debugCnt  optional; number of 8-byte xDebug_<BLK_BITS> slots (debug builds)
#
#   Emits: pushes of rbp,rbx,r12..r15, a LOCAL_SIZE stack reservation,
#   rbp = rsp+FRAME_OFFS, and stores of rdi,rsi,rdx,rcx into
#   ctxPtr,blkPtr,blkCnt,bitAdd.  rsp-relative accesses use the plain
#   offsets defined here; rbp-relative accesses add F_O (= -FRAME_OFFS).
#
# NOTE(review): "&&" inside macro/.irp bodies in this file is evidently meant
# as a bit test (for example "((_RN0_) && 3) == 3" in r1024_Mix could never
# hold for a logical AND) -- presumably an effect of .altmacro expansion;
# confirm before changing any of these to "&".
# NOTE(review): the "% 8" test below holds for every frame built in this file
# (all sizes are multiples of 8), so align16 and tmpStk_<BLK_BITS> are always
# defined; rIdx_offs depends on tmpStk_1024.
#
.macro Setup_Stack BLK_BITS,KS_CNT,debugCnt
    WCNT    =    (\BLK_BITS)/64
#
_PushCnt_   =    0                  #save nonvolatile regs on stack
  .irp _reg_,rbp,rbx,r12,r13,r14,r15
       pushq    %\_reg_
_PushCnt_ = _PushCnt_ + 1           #track count to keep alignment
  .endr
#
_STK_OFFS_  =    0                  #starting offset from rsp
    #---- local  variables         #<-- rsp
    StackVar    X_stk  ,8*(WCNT)    #local context vars
    StackVar    ksTwk  ,8*3         #key schedule: tweak words
    StackVar    ksKey  ,8*(WCNT)+8  #key schedule: key   words
  .if (SKEIN_ASM_UNROLL && (\BLK_BITS)) == 0
    StackVar    ksRot ,16*(\KS_CNT) #leave space for "rotation" to happen
  .endif
    StackVar    Wcopy  ,8*(WCNT)    #copy of input block
  .if _SKEIN_DEBUG
  .if \debugCnt + 0                 #temp location for debug X[] info
    StackVar    xDebug_\BLK_BITS ,8*(\debugCnt)
  .endif
  .endif
  .if ((8*_PushCnt_ + _STK_OFFS_) % 8) == 0
    StackVar    align16,8           #keep 16-byte aligned (adjust for retAddr?)
tmpStk_\BLK_BITS = align16          #use this
  .endif
    #---- saved caller parameters (from regs rdi, rsi, rdx, rcx)
    StackVar    ctxPtr ,8           #context ptr
    StackVar    blkPtr ,8           #pointer to block data
    StackVar    blkCnt ,8           #number of full blocks to process
    StackVar    bitAdd ,8           #bit count to add to tweak
LOCAL_SIZE  =   _STK_OFFS_          #size of "local" vars
    #----
    StackVar    savRegs,8*_PushCnt_ #saved registers
    StackVar    retAddr,8           #return address
    #---- caller's stack frame (aligned mod 16)
#
# set up the stack frame pointer (rbp)
#
FRAME_OFFS  =   ksTwk + 128         #allow short (negative) offset to ksTwk, ksKey
  .if FRAME_OFFS > _STK_OFFS_       #keep rbp in the "locals" range
FRAME_OFFS  =      _STK_OFFS_
  .endif
F_O         =   -FRAME_OFFS
#
    #put some useful defines in the .lst file (for grep)
__STK_LCL_SIZE_\BLK_BITS = LOCAL_SIZE
__STK_TOT_SIZE_\BLK_BITS = _STK_OFFS_
__STK_FRM_OFFS_\BLK_BITS = FRAME_OFFS
#
# Notes on stack frame setup:
#   * the most frequently used variable is X_stk[], based at [rsp+0]
#   * the next most used is the key schedule arrays, ksKey and ksTwk
#       so rbp is "centered" there, allowing short offsets to the key
#       schedule even in 1024-bit Skein case
#   * the Wcopy variables are infrequently accessed, but they have long
#       offsets from both rsp and rbp only in the 1024-bit case.
#   * all other local vars and calling parameters can be accessed
#       with short offsets, except in the 1024-bit case
#
    subq    $LOCAL_SIZE,%rsp        #make room for the locals
    leaq    FRAME_OFFS(%rsp),%rbp   #maximize use of short offsets
    movq    %rdi, ctxPtr+F_O(%rbp)  #save caller's parameters on the stack
    movq    %rsi, blkPtr+F_O(%rbp)
    movq    %rdx, blkCnt+F_O(%rbp)
    movq    %rcx, bitAdd+F_O(%rbp)
#
.endm #Setup_Stack
#
#----------------------------------------------------------------
#
# MACRO: undo Setup_Stack -- release the locals and pop the saved registers
#        in reverse push order; raises an assembly error if the pop count
#        does not bring _PushCnt_ back to zero
#
.macro Reset_Stack
    addq    $LOCAL_SIZE,%rsp        #get rid of locals (wipe??)
  .irp _reg_,r15,r14,r13,r12,rbx,rbp
    popq    %\_reg_                 #restore caller's regs
_PushCnt_ = _PushCnt_ - 1
  .endr
  .if _PushCnt_
    .error  "Mismatched push/pops?"
  .endif
.endm # Reset_Stack
#
#----------------------------------------------------------------
# macros to help debug internals
#
.if _SKEIN_DEBUG
    .extern  Skein_Show_Block       #calls to C routines
    .extern  Skein_Show_Round
#
SKEIN_RND_SPECIAL       =   1000
SKEIN_RND_KEY_INITIAL   =   SKEIN_RND_SPECIAL+0
SKEIN_RND_KEY_INJECT    =   SKEIN_RND_SPECIAL+1
SKEIN_RND_FEED_FWD      =   SKEIN_RND_SPECIAL+2
#
# MACRO: call the C routine Skein_Show_Block, preserving all nine registers
#        pushed below.  Six arguments go in rdi,rsi,rdx,rcx,r8,r9 and the
#        seventh (tweak pointer) is pushed on the stack.
# NOTE(review): the two .error strings below spell the block size as
# _BLK_BITS with a leading underscore, while the macro parameter is named
# BLK_BITS; this looks like a typo that would leave the size out of the
# message -- confirm.
#
.macro Skein_Debug_Block BLK_BITS
#
#void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X,
#                     const u08b_t *blkPtr, const u64b_t *wPtr,
#                     const u64b_t *ksPtr,const u64b_t *tsPtr)
#
_NN_ = 0
  .irp _reg_,rax,rcx,rdx,rsi,rdi,r8,r9,r10,r11
    pushq   %\_reg_                 #save all volatile regs on stack before the call
_NN_ = _NN_ + 1
  .endr
    # get and push call parameters
    movq    $\BLK_BITS      ,%rdi   #bits
    movq    ctxPtr+F_O(%rbp),%rsi   #h (pointer)
    leaq    X_VARS    (%rsi),%rdx   #X (pointer)
    movq    blkPtr+F_O(%rbp),%rcx   #blkPtr
    leaq    Wcopy +F_O(%rbp),%r8    #wPtr
    leaq    ksKey +F_O(%rbp),%r9    #key pointer
    leaq    ksTwk +F_O(%rbp),%rax   #tweak pointer
    pushq   %rax                    #   (pass on the stack)
    call    Skein_Show_Block        #call external debug handler
    addq    $8*1,%rsp               #discard parameters on stack
  .if (_NN_ % 2 ) == 0              #check stack alignment
    .error "Stack misalignment problem in Skein_Debug_Block_\_BLK_BITS"
  .endif
  .irp _reg_,r11,r10,r9,r8,rdi,rsi,rdx,rcx,rax
    popq    %\_reg_                 #restore regs
_NN_ = _NN_ - 1
  .endr
  .if _NN_
    .error "Push/pop mismatch problem in Skein_Debug_Block_\_BLK_BITS"
  .endif
.endm # Skein_Debug_Block
#
# the macro to "call" to debug a round
#   BLK_BITS  block size; selects the local helper Skein_Debug_Round_<BLK_BITS>
#   R         round number, or one of the SKEIN_RND_* special codes
#   RDI_OFFS  optional adjustment applied when the round number is derived
#             from the loop counter
#   afterOp   optional statement emitted after the call
#   rdx is saved and restored around the call and carries the round number in.
#
.macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp
    # call the appropriate (local) debug "function"
    pushq   %rdx                    #save rdx, so we can use it for round "number"
  .if (SKEIN_ASM_UNROLL && \BLK_BITS) || (\R >= SKEIN_RND_SPECIAL)
    movq    $\R,%rdx
  .else                             #compute round number using edi
_rOffs_ = \RDI_OFFS + 0
   .if \BLK_BITS == 1024
    movq    rIdx_offs+8(%rsp),%rdx  #get rIdx off the stack (adjust for pushq rdx above)
    leaq    1+(((\R)-1) && 3)+_rOffs_(,%rdx,4),%rdx
   .else
    leaq    1+(((\R)-1) && 3)+_rOffs_(,%rdi,4),%rdx
   .endif
  .endif
    call    Skein_Debug_Round_\BLK_BITS
    popq    %rdx                    #restore original rdx value
#
    afterOp
.endm  #  Skein_Debug_Round
.else  #------- _SKEIN_DEBUG (dummy macros if debug not enabled)
.macro Skein_Debug_Block BLK_BITS
.endm
#
.macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp
.endm
#
.endif # _SKEIN_DEBUG
#
#----------------------------------------------------------------
#
# MACRO: dstReg += source register (+ immOffs)
#   The source register name is the concatenation of srcReg_A and srcReg_B.
#   A nonzero immOffs selects leaq with displacement; otherwise leaq is used
#   unless useAddOp is nonzero or ASM_NO_LEA is defined, in which case addq.
#
.macro  addReg dstReg,srcReg_A,srcReg_B,useAddOp,immOffs
  .if \immOffs + 0
       leaq    \immOffs(%\srcReg_A\srcReg_B,%\dstReg),%\dstReg
  .elseif ((\useAddOp + 0) == 0)
    .ifndef ASM_NO_LEA  #lea seems to be faster on Core 2 Duo CPUs!
# (addReg macro body, continued: forms without a displacement)
       leaq   (%\srcReg_A\srcReg_B,%\dstReg),%\dstReg
    .else
       addq    %\srcReg_A\srcReg_B,%\dstReg
    .endif
  .else
       addq    %\srcReg_A\srcReg_B,%\dstReg
  .endif
.endm

# MACRO: dstReg ^= source register (name = srcReg_A followed by srcReg_B)
# keep Intel-style ordering here, to match addReg
.macro  xorReg dstReg,srcReg_A,srcReg_B
        xorq   %\srcReg_A\srcReg_B,%\dstReg
.endm
#
#----------------------------------------------------------------
#
# MACRO: define a global entry point under both lName and _lName
#
.macro C_label lName
 \lName:        #use both "genders" to work across linkage conventions
_\lName:
    .global  \lName
    .global _\lName
.endm
#
#=================================== Skein_256 =============================================
#
.if _USE_ASM_ & 256
#
# void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)
#
#   The four arguments arrive in rdi, rsi, rdx, rcx and are parked on the
#   stack by Setup_Stack (ctxPtr, blkPtr, blkCnt, bitAdd).
#   r14 is loaded with tweak word T[1] once, before the block loop.
#
#################
#
# code
#
C_label Skein_256_Process_Block
    Setup_Stack 256,((ROUNDS_256/8)+1)
    movq    TWEAK+8(%rdi),%r14
    jmp     Skein_256_block_loop
    .p2align 4
    # main hash loop for Skein_256
Skein_256_block_loop:
    #
    # general register usage:
    #   RAX..RDX        = X0..X3
    #   R08..R12        = ks[0..4]
    #   R13..R15        = ts[0..2]
    #   RSP, RBP        = stack/frame pointers
    #   RDI             = round counter or context pointer
    #   RSI             = temp
    #
    movq    TWEAK+0(%rdi)     ,%r13
    addq    bitAdd+F_O(%rbp)  ,%r13  #computed updated tweak value T0
    movq    %r14              ,%r15
    xorq    %r13              ,%r15  #now %r13..%r15 is set as the tweak

    movq    $KW_PARITY        ,%r12
    movq       X_VARS+ 0(%rdi),%r8
    movq       X_VARS+ 8(%rdi),%r9
    movq       X_VARS+16(%rdi),%r10
    movq       X_VARS+24(%rdi),%r11
    movq    %r13,TWEAK+0(%rdi)       #save updated tweak value ctx->h.T[0]
    xorq    %r8               ,%r12  #start accumulating overall parity

    movq    blkPtr +F_O(%rbp) ,%rsi  #rsi --> input block
    xorq    %r9               ,%r12
    movq     0(%rsi)          ,%rax  #get X[0..3]
    xorq    %r10              ,%r12
    movq     8(%rsi)          ,%rbx
    xorq    %r11              ,%r12
    movq    16(%rsi)          ,%rcx
    movq    24(%rsi)          ,%rdx

    movq    %rax,Wcopy+ 0+F_O(%rbp)  #save copy of input block
    movq    %rbx,Wcopy+ 8+F_O(%rbp)
    movq    %rcx,Wcopy+16+F_O(%rbp)
    movq    %rdx,Wcopy+24+F_O(%rbp)

    addq    %r8 ,%rax                #initial key injection
    addq    %r9 ,%rbx
    addq    %r10,%rcx
    addq    %r11,%rdx
    addq    %r13,%rbx                #tweak words go into X1 and X2
    addq    %r14,%rcx

.if _SKEIN_DEBUG
    movq    %r14,TWEAK+ 8(%rdi)      #save updated tweak T[1] (start bit cleared?)
    movq    %r8 ,ksKey+ 0+F_O(%rbp)  #save key schedule on stack for Skein_Debug_Block
    movq    %r9 ,ksKey+ 8+F_O(%rbp)
    movq    %r10,ksKey+16+F_O(%rbp)
    movq    %r11,ksKey+24+F_O(%rbp)
    movq    %r12,ksKey+32+F_O(%rbp)

    movq    %r13,ksTwk+ 0+F_O(%rbp)
    movq    %r14,ksTwk+ 8+F_O(%rbp)
    movq    %r15,ksTwk+16+F_O(%rbp)

    movq    %rax,X_stk + 0(%rsp)     #save X[] on stack for Skein_Debug_Block
    movq    %rbx,X_stk + 8(%rsp)
    movq    %rcx,X_stk +16(%rsp)
    movq    %rdx,X_stk +24(%rsp)

    Skein_Debug_Block 256            #debug dump
    Skein_Debug_Round 256,SKEIN_RND_KEY_INITIAL
.endif
#
# Looping build only.  ks[0] and ts[0] were already used by the initial
# injection above, so they are deliberately stored one slot past ks[4] and
# ts[2] (byte offsets 40 and 24) rather than at offset 0: the round loop
# reads ksKey+8*1.. and ksTwk+8*1.. indexed by rdi, so slots 5 and 3 are the
# next words it needs and nothing else writes them.  Setup_Stack places
# ksKey directly after the 24-byte ksTwk, so ksTwk+24 is the same address
# as ksKey+0.
.if ((SKEIN_ASM_UNROLL & 256) == 0)
    movq    %r8 ,ksKey+40+F_O(%rbp)  #save key schedule on stack for looping code
    movq    %r9 ,ksKey+ 8+F_O(%rbp)
    movq    %r10,ksKey+16+F_O(%rbp)
    movq    %r11,ksKey+24+F_O(%rbp)
    movq    %r12,ksKey+32+F_O(%rbp)

    movq    %r13,ksTwk+24+F_O(%rbp)
    movq    %r14,ksTwk+ 8+F_O(%rbp)
    movq    %r15,ksTwk+16+F_O(%rbp)
.endif
    addq    $WCNT*8,%rsi             #skip the block
    movq    %rsi,blkPtr  +F_O(%rbp)  #update block pointer
    #
    # now the key schedule is computed.
# Start the rounds
    #
    # Unrolled build: all ROUNDS_256 rounds are emitted inline and rdi/rsi
    # serve as temporaries for precomputed key+tweak sums.
    # Looping build: rdi counts key injections; each pass of the loop emits
    # _UNROLL_CNT*2 groups of four rounds and the loop exits once rdi
    # reaches 2*(ROUNDS_256/8).
    #
.if SKEIN_ASM_UNROLL & 256
_UNROLL_CNT =   ROUNDS_256/8
.else
_UNROLL_CNT =   SKEIN_UNROLL_256
  .if ((ROUNDS_256/8) % _UNROLL_CNT)
    .error "Invalid SKEIN_UNROLL_256"
  .endif
    xorq    %rdi,%rdi                #rdi = iteration count
Skein_256_round_loop:
.endif
_Rbase_ = 0
.rept _UNROLL_CNT*2
    # all X and ks vars in regs      # (ops to "rotate" ks vars, via mem, if not unrolled)
    # round 4*_Rbase_ + 0
    addReg  rax, rbx
    RotL64  rbx, 256,%((4*_Rbase_+0) % 8),0
    addReg  rcx, rdx
                .if (SKEIN_ASM_UNROLL & 256) == 0
                    movq ksKey+8*1+F_O(%rbp,%rdi,8),%r8
                .endif
    xorReg  rbx, rax
    RotL64  rdx, 256,%((4*_Rbase_+0) % 8),1
    xorReg  rdx, rcx
  .if SKEIN_ASM_UNROLL & 256
    .irp _r0_,%( 8+(_Rbase_+3) % 5)
    .irp _r1_,%(13+(_Rbase_+2) % 3)
      leaq   (%r\_r0_,%r\_r1_),%rdi    #precompute key injection value for %rcx
    .endr
    .endr
  .endif
                .if (SKEIN_ASM_UNROLL & 256) == 0
                    movq ksTwk+8*1+F_O(%rbp,%rdi,8),%r13
                .endif
    Skein_Debug_Round 256,%(4*_Rbase_+1)

    # round 4*_Rbase_ + 1
    addReg  rax, rdx
    RotL64  rdx, 256,%((4*_Rbase_+1) % 8),0
    xorReg  rdx, rax
                .if (SKEIN_ASM_UNROLL & 256) == 0
                    movq ksKey+8*2+F_O(%rbp,%rdi,8),%r9
                .endif
    addReg  rcx, rbx
    RotL64  rbx, 256,%((4*_Rbase_+1) % 8),1
    xorReg  rbx, rcx
                .if (SKEIN_ASM_UNROLL & 256) == 0
                    movq ksKey+8*4+F_O(%rbp,%rdi,8),%r11
                .endif
    Skein_Debug_Round 256,%(4*_Rbase_+2)
 .if SKEIN_ASM_UNROLL & 256
    .irp _r0_,%( 8+(_Rbase_+2) % 5)
    .irp _r1_,%(13+(_Rbase_+1) % 3)
      leaq   (%r\_r0_,%r\_r1_),%rsi     #precompute key injection value for %rbx
    .endr
    .endr
 .endif
    # round 4*_Rbase_ + 2
    addReg  rax, rbx
    RotL64  rbx, 256,%((4*_Rbase_+2) % 8),0
    addReg  rcx, rdx
                .if (SKEIN_ASM_UNROLL & 256) == 0
                    movq ksKey+8*3+F_O(%rbp,%rdi,8),%r10
                .endif
    xorReg  rbx, rax
    RotL64  rdx, 256,%((4*_Rbase_+2) % 8),1
    xorReg  rdx, rcx
                .if (SKEIN_ASM_UNROLL & 256) == 0
                    movq %r8,ksKey+8*6+F_O(%rbp,%rdi,8)  #"rotate" the key
                    leaq 1(%r11,%rdi),%r11               #precompute key + tweak
                .endif
    Skein_Debug_Round 256,%(4*_Rbase_+3)
    # round 4*_Rbase_ + 3
    addReg  rax, rdx
    RotL64  rdx, 256,%((4*_Rbase_+3) % 8),0
    addReg  rcx, rbx
                .if (SKEIN_ASM_UNROLL & 256) == 0
                    addq      ksTwk+8*2+F_O(%rbp,%rdi,8),%r10  #precompute key + tweak
                    movq %r13,ksTwk+8*4+F_O(%rbp,%rdi,8)       #"rotate" the tweak
                .endif
    xorReg  rdx, rax
    RotL64  rbx, 256,%((4*_Rbase_+3) % 8),1
    xorReg  rbx, rcx
    Skein_Debug_Round 256,%(4*_Rbase_+4)
                .if (SKEIN_ASM_UNROLL & 256) == 0
                    addReg r9 ,r13           #precompute key+tweak
                .endif
      #inject key schedule words
_Rbase_ = _Rbase_+1
  .if SKEIN_ASM_UNROLL & 256
    addReg    rax,r,%(8+((_Rbase_+0) % 5))
    addReg    rbx,rsi
    addReg    rcx,rdi
    addReg    rdx,r,%(8+((_Rbase_+3) % 5)),,_Rbase_
  .else
    incq      %rdi
    addReg    rax,r8
    addReg    rcx,r10
    addReg    rbx,r9
    addReg    rdx,r11
  .endif
    Skein_Debug_Round 256,SKEIN_RND_KEY_INJECT
.endr #rept _UNROLL_CNT*2
#
.if (SKEIN_ASM_UNROLL & 256) == 0
    cmpq    $2*(ROUNDS_256/8),%rdi
    jb      Skein_256_round_loop
.endif # (SKEIN_ASM_UNROLL & 256) == 0
    movq    ctxPtr +F_O(%rbp),%rdi           #restore rdi --> context

    #----------------------------
    # feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..3}
    # r14 = T[1] with bit 62 cleared; it is the T[1] used for the next block
    # and is written back to the context only after the last block.
    movq    $FIRST_MASK64 ,%r14
    xorq    Wcopy + 0+F_O (%rbp),%rax
    xorq    Wcopy + 8+F_O (%rbp),%rbx
    xorq    Wcopy +16+F_O (%rbp),%rcx
    xorq    Wcopy +24+F_O (%rbp),%rdx
    andq    TWEAK + 8     (%rdi),%r14
    movq    %rax,X_VARS+ 0(%rdi)     #store final result
    movq    %rbx,X_VARS+ 8(%rdi)
    movq    %rcx,X_VARS+16(%rdi)
    movq    %rdx,X_VARS+24(%rdi)

    Skein_Debug_Round 256,SKEIN_RND_FEED_FWD

    # go back for more blocks, if needed
    decq    blkCnt+F_O(%rbp)
    jnz     Skein_256_block_loop
    movq    %r14,TWEAK   + 8(%rdi)
    Reset_Stack
    ret
Skein_256_Process_Block_End:

  .if _SKEIN_DEBUG
Skein_Debug_Round_256:               #here with rdx == round "number" from macro
    pushq   %rsi                     #save two regs for BLK_BITS-specific parms
    pushq   %rdi
    movq    24(%rsp),%rdi            #get back original rdx (pushed on stack in macro call) to rdi
    movq    %rax,X_stk+ 0+F_O(%rbp)  #save X[] state on stack so debug routines can access it
    movq    %rbx,X_stk+ 8+F_O(%rbp)  #(use FP_ since rsp has changed!)
    movq    %rcx,X_stk+16+F_O(%rbp)
    movq    %rdi,X_stk+24+F_O(%rbp)  #rdi holds the caller's rdx (X3) here

    movq    ctxPtr+F_O(%rbp),%rsi    #ctx_hdr_ptr
    movq    $256,%rdi                #now <rdi,rsi,rdx> are set for the call
    jmp     Skein_Debug_Round_Common
  .endif
#
.if _SKEIN_CODE_SIZE
# returns (in rax) the byte length of Skein_256_Process_Block
C_label  Skein_256_Process_Block_CodeSize
    movq    $(Skein_256_Process_Block_End-Skein_256_Process_Block),%rax
    ret
#
# returns (in rax) the unroll count in use, or 0 when fully unrolled
C_label Skein_256_Unroll_Cnt
  .if _UNROLL_CNT <> ROUNDS_256/8
    movq    $_UNROLL_CNT,%rax
  .else
    xorq    %rax,%rax
  .endif
    ret
.endif
#
.endif #_USE_ASM_ & 256
#
#=================================== Skein_512 =============================================
#
.if _USE_ASM_ & 512
#
# void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)
#
# X[i] == %r[8+i]          #register assignments for X[] values during rounds (i=0..7)
#
#################
# MACRO: one round for 512-bit blocks
#   rn0..rn7  register numbers (8..15), mixed as the pairs
#             (rn0,rn1) (rn2,rn3) (rn4,rn5) (rn6,rn7)
#   _Rn_      round number (selects the rotation constants, mod 8)
#   op1..op4  optional statements emitted after the respective mix
#
.macro R_512_OneRound rn0,rn1,rn2,rn3,rn4,rn5,rn6,rn7,_Rn_,op1,op2,op3,op4
#
    addReg      r\rn0, r\rn1
    RotL64      r\rn1, 512,%((_Rn_) % 8),0
    xorReg      r\rn1, r\rn0
            op1
    addReg      r\rn2, r\rn3
    RotL64      r\rn3, 512,%((_Rn_) % 8),1
    xorReg      r\rn3, r\rn2
            op2
    addReg      r\rn4, r\rn5
    RotL64      r\rn5, 512,%((_Rn_) % 8),2
    xorReg      r\rn5, r\rn4
            op3
    addReg      r\rn6, r\rn7
    RotL64      r\rn7, 512,%((_Rn_) % 8),3
    xorReg      r\rn7, r\rn6
            op4
    Skein_Debug_Round 512,%(_Rn_+1),-4
#
.endm   #R_512_OneRound
#
#################
# MACRO: four rounds for 512-bit blocks, followed by one key injection
#
.macro R_512_FourRounds _RR_    #RR = base round number (a multiple of 4)
  .if (SKEIN_ASM_UNROLL && 512)
    # here for
# fully unrolled case.
    # Key/tweak words are fetched at assemble-time-constant offsets while
    # the rounds run: rax,rbx,rsi receive key words, rcx and rdx receive a
    # key word plus a tweak word each; all five are added in below.
    _II_ = ((_RR_)/4) + 1       #key injection counter
    R_512_OneRound  8, 9,10,11,12,13,14,15,%((_RR_)+0),<movq ksKey+8*(((_II_)+3) % 9)+F_O(%rbp),%rax>,,<movq ksKey+8*(((_II_)+4) % 9)+F_O(%rbp),%rbx>
    R_512_OneRound 10, 9,12,15,14,13, 8,11,%((_RR_)+1),<movq ksKey+8*(((_II_)+5) % 9)+F_O(%rbp),%rcx>,,<movq ksKey+8*(((_II_)+6) % 9)+F_O(%rbp),%rdx>
    R_512_OneRound 12, 9,14,11, 8,13,10,15,%((_RR_)+2),<movq ksKey+8*(((_II_)+7) % 9)+F_O(%rbp),%rsi>,,<addq ksTwk+8*(((_II_)+0) % 3)+F_O(%rbp),%rcx>
    R_512_OneRound 14, 9, 8,15,10,13,12,11,%((_RR_)+3),<addq ksTwk+8*(((_II_)+1) % 3)+F_O(%rbp),%rdx>,
    # inject the key schedule
    addq    ksKey+8*(((_II_)+0)%9)+F_O(%rbp),%r8
    addReg   r11, rax
    addq    ksKey+8*(((_II_)+1)%9)+F_O(%rbp),%r9
    addReg   r12, rbx
    addq    ksKey+8*(((_II_)+2)%9)+F_O(%rbp),%r10
    addReg   r13, rcx
    addReg   r14, rdx
    addReg   r15, rsi,,,(_II_)
  .else
    # here for looping case                                                    #"rotate" key/tweak schedule (move up on stack)
    # rdi indexes the key/tweak arrays on the stack.  During the rounds one
    # tweak word (via rax) and one key word (via rsi) are copied further up
    # their arrays; rax,rbx,rsi then pick up key words and rcx,rdx a key
    # word plus a tweak word each for the injection below.
    incq    %rdi                 #bump key injection counter
    R_512_OneRound  8, 9,10,11,12,13,14,15,%((_RR_)+0),<movq ksKey+8*6+F_O(%rbp,%rdi,8),%rdx>,<movq ksTwk-8*1+F_O(%rbp,%rdi,8),%rax>,<movq ksKey-8*1+F_O(%rbp,%rdi,8),%rsi>
    R_512_OneRound 10, 9,12,15,14,13, 8,11,%((_RR_)+1),<movq ksKey+8*5+F_O(%rbp,%rdi,8),%rcx>,<movq %rax,ksTwk+8*2+F_O(%rbp,%rdi,8) >,<movq %rsi,ksKey+8*8+F_O(%rbp,%rdi,8)>
    R_512_OneRound 12, 9,14,11, 8,13,10,15,%((_RR_)+2),<movq ksKey+8*4+F_O(%rbp,%rdi,8),%rbx>,<addq ksTwk+8*1+F_O(%rbp,%rdi,8),%rdx>,<movq ksKey+8*7+F_O(%rbp,%rdi,8),%rsi>
    R_512_OneRound 14, 9, 8,15,10,13,12,11,%((_RR_)+3),<movq ksKey+8*3+F_O(%rbp,%rdi,8),%rax>,<addq ksTwk+8*0+F_O(%rbp,%rdi,8),%rcx>
    # inject the key schedule
    addq    ksKey+8*0+F_O(%rbp,%rdi,8),%r8
    addReg   r11, rax
    addReg   r12, rbx
    addq    ksKey+8*1+F_O(%rbp,%rdi,8),%r9
    addReg   r13, rcx
    addReg   r14, rdx
    addq    ksKey+8*2+F_O(%rbp,%rdi,8),%r10
    addReg   r15, rsi
    addReg   r15, rdi              #inject the round number
  .endif

    #show
# the result of the key injection
    Skein_Debug_Round 512,SKEIN_RND_KEY_INJECT
.endm   #R_512_FourRounds
#
#################
# instantiated code
#
# Skein_512_Process_Block(ctx, blkPtr, blkCnt, bitcntAdd)
#   The four arguments arrive in rdi, rsi, rdx, rcx and are parked on the
#   stack by Setup_Stack (ctxPtr, blkPtr, blkCnt, bitAdd).
#   rbx is loaded with tweak word T[1] once, before the block loop.
#
C_label Skein_512_Process_Block
    Setup_Stack 512,ROUNDS_512/8
    movq    TWEAK+     8(%rdi),%rbx
    jmp     Skein_512_block_loop
    .p2align 4
    # main hash loop for Skein_512
Skein_512_block_loop:
    # general register usage:
    #   RAX..RDX        = temps for key schedule pre-loads
    #   R8 ..R15        = X0..X7
    #   RSP, RBP        = stack/frame pointers
    #   RDI             = round counter or context pointer
    #   RSI             = temp
    #
    movq    TWEAK +  0(%rdi),%rax
    addq    bitAdd+F_O(%rbp),%rax     #computed updated tweak value T0
    movq    %rbx,%rcx
    xorq    %rax,%rcx                 #%rax/%rbx/%rcx = tweak schedule
    movq    %rax,TWEAK+ 0    (%rdi)   #save updated tweak value ctx->h.T[0]
    movq    %rax,ksTwk+ 0+F_O(%rbp)
    movq    $KW_PARITY,%rdx
    movq    blkPtr +F_O(%rbp),%rsi    #%rsi --> input block
    movq    %rbx,ksTwk+ 8+F_O(%rbp)
    movq    %rcx,ksTwk+16+F_O(%rbp)
    .irp _Rn_,8,9,10,11,12,13,14,15
      movq  X_VARS+8*(_Rn_-8)(%rdi),%r\_Rn_
      xorq  %r\_Rn_,%rdx              #compute overall parity
      movq  %r\_Rn_,ksKey+8*(_Rn_-8)+F_O(%rbp)
    .endr                             #load state into %r8 ..%r15, compute parity
      movq  %rdx,ksKey+8*(8)+F_O(%rbp)#save key schedule parity

    addReg   r13,rax                  #precompute key injection for tweak
    addReg   r14, rbx
.if _SKEIN_DEBUG
    movq    %rbx,TWEAK+ 8(%rdi)       #save updated tweak value ctx->h.T[1] for Skein_Debug_Block below
.endif
    # addReg only modifies its first operand, so rax..rdx still hold the
    # message words when they are copied to Wcopy
    movq     0(%rsi),%rax             #load input block
    movq     8(%rsi),%rbx
    movq    16(%rsi),%rcx
    movq    24(%rsi),%rdx
    addReg   r8 , rax                 #do initial key injection
    addReg   r9 , rbx
    movq    %rax,Wcopy+ 0+F_O(%rbp)   #keep local copy for feedforward
    movq    %rbx,Wcopy+ 8+F_O(%rbp)
    addReg   r10, rcx
    addReg   r11, rdx
    movq    %rcx,Wcopy+16+F_O(%rbp)
    movq    %rdx,Wcopy+24+F_O(%rbp)

    movq    32(%rsi),%rax
    movq    40(%rsi),%rbx
    movq    48(%rsi),%rcx
    movq    56(%rsi),%rdx
    addReg   r12, rax
    addReg   r13, rbx
    addReg   r14, rcx
    addReg   r15, rdx
    movq    %rax,Wcopy+32+F_O(%rbp)
    movq    %rbx,Wcopy+40+F_O(%rbp)
    movq    %rcx,Wcopy+48+F_O(%rbp)
    movq    %rdx,Wcopy+56+F_O(%rbp)

.if _SKEIN_DEBUG
    .irp _Rn_,8,9,10,11,12,13,14,15   #save values on stack for debug output
      movq  %r\_Rn_,X_stk+8*(_Rn_-8)(%rsp)
    .endr

    Skein_Debug_Block 512             #debug dump
    Skein_Debug_Round 512,SKEIN_RND_KEY_INITIAL
.endif
    addq    $8*WCNT,%rsi              #skip the block
    movq    %rsi,blkPtr+F_O(%rbp)     #update block pointer
    #
    #################
    # now the key schedule is computed. Start the rounds
    #
.if SKEIN_ASM_UNROLL & 512
_UNROLL_CNT =   ROUNDS_512/8
.else
_UNROLL_CNT =   SKEIN_UNROLL_512
  .if ((ROUNDS_512/8) % _UNROLL_CNT)
    # .error (not .err) so that the message text is actually reported;
    # this matches the check in the Skein_256 section
    .error "Invalid SKEIN_UNROLL_512"
  .endif
    xorq    %rdi,%rdi                 #rdi = round counter
Skein_512_round_loop:
.endif
#
_Rbase_ = 0
.rept _UNROLL_CNT*2
      R_512_FourRounds %(4*_Rbase_+00)
_Rbase_ = _Rbase_+1
.endr #rept _UNROLL_CNT*2
#
.if (SKEIN_ASM_UNROLL & 512) == 0
    cmpq    $2*(ROUNDS_512/8),%rdi
    jb      Skein_512_round_loop
    movq    ctxPtr +F_O(%rbp),%rdi           #restore rdi --> context
.endif
    # end of rounds
    #################
    # feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..7}
    # rbx = T[1] with bit 62 cleared; it is the T[1] used for the next block
    # and is written back to the context only after the last block.
    .irp _Rn_,8,9,10,11,12,13,14,15
  .if (_Rn_ == 8)
    movq    $FIRST_MASK64,%rbx
  .endif
      xorq  Wcopy+8*(_Rn_-8)+F_O(%rbp),%r\_Rn_  #feedforward XOR
      movq  %r\_Rn_,X_VARS+8*(_Rn_-8)(%rdi)     #and store result
  .if (_Rn_ == 14)
    andq    TWEAK+ 8(%rdi),%rbx
  .endif
    .endr
    Skein_Debug_Round 512,SKEIN_RND_FEED_FWD

    # go back for more blocks, if needed
    decq    blkCnt+F_O(%rbp)
    jnz     Skein_512_block_loop
    movq    %rbx,TWEAK + 8(%rdi)

    Reset_Stack
    ret
Skein_512_Process_Block_End:
#
  .if _SKEIN_DEBUG
# call here with rdx  = "round number"
Skein_Debug_Round_512:
    pushq   %rsi                      #save two regs for BLK_BITS-specific parms
    pushq   %rdi
    .irp _Rn_,8,9,10,11,12,13,14,15   #save X[] state on stack so
# debug routines can access it
      movq  %r\_Rn_,X_stk+8*(_Rn_-8)+F_O(%rbp)
    .endr
    movq    ctxPtr+F_O(%rbp),%rsi     #ctx_hdr_ptr
    movq    $512,%rdi                 #now <rdi,rsi,rdx> are set for the call
    jmp     Skein_Debug_Round_Common
  .endif
#
.if _SKEIN_CODE_SIZE
# returns (in rax) the byte length of Skein_512_Process_Block
C_label Skein_512_Process_Block_CodeSize
    movq    $(Skein_512_Process_Block_End-Skein_512_Process_Block),%rax
    ret
#
# returns (in rax) the unroll count in use, or 0 when fully unrolled
C_label Skein_512_Unroll_Cnt
  .if _UNROLL_CNT <> (ROUNDS_512/8)
    movq    $_UNROLL_CNT,%rax
  .else
    xorq    %rax,%rax
  .endif
    ret
.endif
#
.endif # _USE_ASM_ & 512
#
#=================================== Skein1024 =============================================
.if _USE_ASM_ & 1024
#
# void Skein1024_Process_Block(Skein_1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)
#
#################
# use details of permutation to make register assignments
#
# o1K_<reg> = index of the X[] word that register <reg> carries.
# There is no entry for X6: it shares rcx with X4 and the word not
# currently in rcx is parked in X_stk (see r1024_FourRounds).
#
o1K_rdi =  0        #offsets in X[] associated with each register
o1K_rsi =  1
o1K_rbp =  2
o1K_rax =  3
o1K_rcx =  4        #rcx is "shared" with X6, since X4/X6 alternate
o1K_rbx =  5
o1K_rdx =  7
o1K_r8  =  8
o1K_r9  =  9
o1K_r10 = 10
o1K_r11 = 11
o1K_r12 = 12
o1K_r13 = 13
o1K_r14 = 14
o1K_r15 = 15
#
rIdx_offs = tmpStk_1024     #stack slot holding the loop/injection index (looping build)
#
# MACRO: one MIX of words w0,w1 held in reg0,reg1, using rotation constant
#        RC_1024_<_RN0_ mod 8>_<_Rn1_>.  After the last round of each group
#        of four the key injection for both words is folded in as well.
#        op1 is an optional statement emitted at the end.
#
.macro r1024_Mix w0,w1,reg0,reg1,_RN0_,_Rn1_,op1
    addReg      \reg0 , \reg1                      #perform the MIX
    RotL64      \reg1 , 1024,%((_RN0_) % 8),_Rn1_
    xorReg      \reg1 , \reg0
.if ((_RN0_) && 3) == 3         #time to do key injection?
# (r1024_Mix macro body, continued: key injection after every fourth round)
 .if _SKEIN_DEBUG
    movq       %\reg0 , xDebug_1024+8*w0(%rsp)     #save intermediate values for Debug_Round
    movq       %\reg1 , xDebug_1024+8*w1(%rsp)     # (before inline key injection)
 .endif
_II_ = ((_RN0_)/4)+1            #injection count
 .if SKEIN_ASM_UNROLL && 1024   #here to do fully unrolled key injection
    addq        ksKey+ 8*((_II_+w0) % 17)(%rsp),%\reg0
    addq        ksKey+ 8*((_II_+w1) % 17)(%rsp),%\reg1
  .if     w1 == 13                                 #tweak injection
    addq        ksTwk+ 8*((_II_+ 0) %  3)(%rsp),%\reg1
  .elseif w0 == 14
    addq        ksTwk+ 8*((_II_+ 1) %  3)(%rsp),%\reg0
  .elseif w1 == 15
    addq        $_II_, %\reg1                      #(injection counter)
  .endif
 .else                          #here to do looping  key injection
  # word 0 lives in rdi, which is needed as the index register: it is
  # parked in X_stk here and its injection is done at the end of
  # r1024_FourRounds
  .if  (w0 == 0)
    movq        %rdi, X_stk+8*w0(%rsp)             #if so, store N0 so we can use reg as index
    movq         rIdx_offs(%rsp),%rdi              #get the injection counter index into rdi
  .else
    addq        ksKey+8+8*w0(%rsp,%rdi,8),%\reg0   #even key injection
  .endif
  .if     w1 == 13                                 #tweak injection
    addq        ksTwk+8+8* 0(%rsp,%rdi,8),%\reg1
  .elseif w0 == 14
    addq        ksTwk+8+8* 1(%rsp,%rdi,8),%\reg0
  .elseif w1 == 15
    addReg      \reg1,rdi,,,1                      #(injection counter)
  .endif
    addq        ksKey+8+8*w1(%rsp,%rdi,8),%\reg1   #odd key injection
 .endif
.endif
    # insert the op provided, if any
    op1
.endm
#################
# MACRO: four rounds for 1024-bit blocks
#
#   Within each round the third mix hands rcx over: its op saves the word
#   currently in rcx (X4 or X6) to X_stk and the op of the next mix loads
#   the other one.
#
.macro r1024_FourRounds _RR_    #RR = base round number (0 mod 4)
    # should be here with X4 set properly, X6 stored on stack
_Rn_ = (_RR_) + 0
        r1024_Mix  0, 1,rdi,rsi,_Rn_,0
        r1024_Mix  2, 3,rbp,rax,_Rn_,1
        r1024_Mix  4, 5,rcx,rbx,_Rn_,2,<movq %rcx,X_stk+8*4(%rsp)>       #save X4  on  stack (x4/x6 alternate)
        r1024_Mix  8, 9,r8 ,r9 ,_Rn_,4,<movq      X_stk+8*6(%rsp),%rcx>  #load X6 from stack
        r1024_Mix 10,11,r10,r11,_Rn_,5
        r1024_Mix 12,13,r12,r13,_Rn_,6
        r1024_Mix  6, 7,rcx,rdx,_Rn_,3
        r1024_Mix 14,15,r14,r15,_Rn_,7
    .if _SKEIN_DEBUG
      Skein_Debug_Round 1024,%(_Rn_+1)
    .endif
_Rn_ = (_RR_) + 1
        r1024_Mix  0, 9,rdi,r9 ,_Rn_,0
        r1024_Mix  2,13,rbp,r13,_Rn_,1
        r1024_Mix  6,11,rcx,r11,_Rn_,2,<movq %rcx,X_stk+8*6(%rsp)>       #save X6  on  stack (x4/x6 alternate)
        r1024_Mix 10, 7,r10,rdx,_Rn_,4,<movq      X_stk+8*4(%rsp),%rcx>  #load X4 from stack
        r1024_Mix 12, 3,r12,rax,_Rn_,5
        r1024_Mix 14, 5,r14,rbx,_Rn_,6
        r1024_Mix  4,15,rcx,r15,_Rn_,3
        r1024_Mix  8, 1,r8 ,rsi,_Rn_,7
    .if _SKEIN_DEBUG
      Skein_Debug_Round 1024,%(_Rn_+1)
    .endif
_Rn_ = (_RR_) + 2
        r1024_Mix  0, 7,rdi,rdx,_Rn_,0
        r1024_Mix  2, 5,rbp,rbx,_Rn_,1
        r1024_Mix  4, 3,rcx,rax,_Rn_,2,<movq %rcx,X_stk+8*4(%rsp)>       #save X4  on  stack (x4/x6 alternate)
        r1024_Mix 12,15,r12,r15,_Rn_,4,<movq      X_stk+8*6(%rsp),%rcx>  #load X6 from stack
        r1024_Mix 14,13,r14,r13,_Rn_,5
        r1024_Mix  8,11,r8 ,r11,_Rn_,6
        r1024_Mix  6, 1,rcx,rsi,_Rn_,3
        r1024_Mix 10, 9,r10,r9 ,_Rn_,7
    .if _SKEIN_DEBUG
      Skein_Debug_Round 1024,%(_Rn_+1)
    .endif
_Rn_ = (_RR_) + 3
        r1024_Mix  0,15,rdi,r15,_Rn_,0
        r1024_Mix  2,11,rbp,r11,_Rn_,1
        r1024_Mix  6,13,rcx,r13,_Rn_,2,<movq %rcx,X_stk+8*6(%rsp)>       #save X6  on  stack (x4/x6 alternate)
        r1024_Mix 14, 1,r14,rsi,_Rn_,4,<movq      X_stk+8*4(%rsp),%rcx>  #load X4 from stack
        r1024_Mix  8, 5,r8 ,rbx,_Rn_,5
        r1024_Mix 10, 3,r10,rax,_Rn_,6
        r1024_Mix  4, 9,rcx,r9 ,_Rn_,3
        r1024_Mix 12, 7,r12,rdx,_Rn_,7
    .if _SKEIN_DEBUG
      Skein_Debug_Round 1024,%(_Rn_+1)
    .endif

  .if (SKEIN_ASM_UNROLL && 1024) == 0           #here with rdi == rIdx, X0 on stack
    #"rotate" the key schedule on the stack
i8 = o1K_r8
i0 = o1K_rdi
    movq    %r8 , X_stk+8*i8(%rsp)              #free up a register (save it on the stack)
    movq          ksKey+8* 0(%rsp,%rdi,8),%r8   #get  key  word
    movq    %r8 , ksKey+8*17(%rsp,%rdi,8)       #rotate key (must do key first or tweak clobbers it!)
1069 movq ksTwk+8* 0(%rsp,%rdi,8),%r8 #get tweak word 1070 movq %r8 , ksTwk+8* 3(%rsp,%rdi,8) #rotate tweak (onto the stack) 1071 movq X_stk+8*i8(%rsp) ,%r8 #get the reg back 1072 incq %rdi #bump the index 1073 movq %rdi, rIdx_offs (%rsp) #save rdi again 1074 movq ksKey+8*i0(%rsp,%rdi,8),%rdi #get the key schedule word for X0 back 1075 addq X_stk+8*i0(%rsp) ,%rdi #perform the X0 key injection 1076 .endif 1077 #show the result of the key injection 1078 Skein_Debug_Round 1024,SKEIN_RND_KEY_INJECT 1079.endm #r1024_FourRounds 1080# 1081################ 1082# code 1083# 1084C_label Skein1024_Process_Block 1085# 1086 Setup_Stack 1024,ROUNDS_1024/8,WCNT 1087 movq TWEAK+ 8(%rdi),%r9 1088 jmp Skein1024_block_loop 1089 # main hash loop for Skein1024 1090 .p2align 4 1091Skein1024_block_loop: 1092 # general register usage: 1093 # RSP = stack pointer 1094 # RAX..RDX,RSI,RDI = X1, X3..X7 (state words) 1095 # R8 ..R15 = X8..X15 (state words) 1096 # RBP = temp (used for X0 and X2) 1097 # 1098 .if (SKEIN_ASM_UNROLL & 1024) == 0 1099 xorq %rax,%rax #init loop index on the stack 1100 movq %rax,rIdx_offs(%rsp) 1101 .endif 1102 movq TWEAK+ 0(%rdi),%r8 1103 addq bitAdd+ F_O(%rbp),%r8 #computed updated tweak value T0 1104 movq %r9 ,%r10 1105 xorq %r8 ,%r10 #%rax/%rbx/%rcx = tweak schedule 1106 movq %r8 ,TWEAK+ 0(%rdi) #save updated tweak value ctx->h.T[0] 1107 movq %r8 ,ksTwk+ 0+F_O(%rbp) 1108 movq %r9 ,ksTwk+ 8+F_O(%rbp) #keep values in %r8 ,%r9 for initial tweak injection below 1109 movq %r10,ksTwk+16+F_O(%rbp) 1110 .if _SKEIN_DEBUG 1111 movq %r9 ,TWEAK+ 8(%rdi) #save updated tweak value ctx->h.T[1] for Skein_Debug_Block 1112 .endif 1113 movq blkPtr +F_O(%rbp),%rsi # rsi --> input block 1114 movq $KW_PARITY ,%rax #overall key schedule parity 1115 1116 # the logic here assumes the set {rdi,rsi,rbp,rax} = X[0,1,2,3] 1117 .irp _rN_,0,1,2,3,4,6 #process the "initial" words, using r14/r15 as temps 1118 movq X_VARS+8*_rN_(%rdi),%r14 #get state word 1119 movq 8*_rN_(%rsi),%r15 #get msg word 
1120 xorq %r14,%rax #update key schedule overall parity 1121 movq %r14,ksKey +8*_rN_+F_O(%rbp) #save key schedule word on stack 1122 movq %r15,Wcopy +8*_rN_+F_O(%rbp) #save local msg Wcopy 1123 addq %r15,%r14 #do the initial key injection 1124 movq %r14,X_stk +8*_rN_ (%rsp) #save initial state var on stack 1125 .endr 1126 # now process the rest, using the "real" registers 1127 # (MUST do it in reverse order to inject tweaks r8/r9 first) 1128 .irp _rr_,r15,r14,r13,r12,r11,r10,r9,r8,rdx,rbx 1129_oo_ = o1K_\_rr_ #offset assocated with the register 1130 movq X_VARS+8*_oo_(%rdi),%\_rr_ #get key schedule word from context 1131 movq 8*_oo_(%rsi),%rcx #get next input msg word 1132 movq %\_rr_, ksKey +8*_oo_(%rsp) #save key schedule on stack 1133 xorq %\_rr_, %rax #accumulate key schedule parity 1134 movq %rcx,Wcopy+8*_oo_+F_O(%rbp) #save copy of msg word for feedforward 1135 addq %rcx,%\_rr_ #do the initial key injection 1136 .if _oo_ == 13 #do the initial tweak injection 1137 addReg _rr_,r8 # (only in words 13/14) 1138 .elseif _oo_ == 14 1139 addReg _rr_,r9 1140 .endif 1141 .endr 1142 movq %rax,ksKey+8*WCNT+F_O(%rbp) #save key schedule parity 1143.if _SKEIN_DEBUG 1144 Skein_Debug_Block 1024 #initial debug dump 1145.endif 1146 addq $8*WCNT,%rsi #bump the msg ptr 1147 movq %rsi,blkPtr+F_O(%rbp) #save bumped msg ptr 1148 # re-load words 0..4 from stack, enter the main loop 1149 .irp _rr_,rdi,rsi,rbp,rax,rcx #(no need to re-load x6, already on stack) 1150 movq X_stk+8*o1K_\_rr_(%rsp),%\_rr_ #re-load state and get ready to go! 1151 .endr 1152.if _SKEIN_DEBUG 1153 Skein_Debug_Round 1024,SKEIN_RND_KEY_INITIAL #show state after initial key injection 1154.endif 1155 # 1156 ################# 1157 # now the key schedule is computed. 
Start the rounds 1158 # 1159.if SKEIN_ASM_UNROLL & 1024 1160_UNROLL_CNT = ROUNDS_1024/8 1161.else 1162_UNROLL_CNT = SKEIN_UNROLL_1024 1163 .if ((ROUNDS_1024/8) % _UNROLL_CNT) 1164 .error "Invalid SKEIN_UNROLL_1024" 1165 .endif 1166Skein1024_round_loop: 1167.endif 1168# 1169_Rbase_ = 0 1170.rept _UNROLL_CNT*2 #implement the rounds, 4 at a time 1171 r1024_FourRounds %(4*_Rbase_+00) 1172_Rbase_ = _Rbase_+1 1173.endr #rept _UNROLL_CNT 1174# 1175.if (SKEIN_ASM_UNROLL & 1024) == 0 1176 cmpq $2*(ROUNDS_1024/8),tmpStk_1024(%rsp) #see .if we are done 1177 jb Skein1024_round_loop 1178.endif 1179 # end of rounds 1180 ################# 1181 # 1182 # feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..15} 1183 movq %rdx,X_stk+8*o1K_rdx(%rsp) #we need a register. x6 already on stack 1184 movq ctxPtr(%rsp),%rdx 1185 1186 .irp _rr_,rdi,rsi,rbp,rax,rcx,rbx,r8,r9,r10,r11,r12,r13,r14,r15 #do all but x6,x7 1187_oo_ = o1K_\_rr_ 1188 xorq Wcopy +8*_oo_(%rsp),%\_rr_ #feedforward XOR 1189 movq %\_rr_,X_VARS+8*_oo_(%rdx) #save result into context 1190 .if (_oo_ == 9) 1191 movq $FIRST_MASK64 ,%r9 1192 .endif 1193 .if (_oo_ == 14) 1194 andq TWEAK+ 8(%rdx),%r9 1195 .endif 1196 .endr 1197 # 1198 movq X_stk +8*6(%rsp),%rax #now process x6,x7 (skipped in .irp above) 1199 movq X_stk +8*7(%rsp),%rbx 1200 xorq Wcopy +8*6(%rsp),%rax 1201 xorq Wcopy +8*7(%rsp),%rbx 1202 movq %rax,X_VARS+8*6(%rdx) 1203 decq blkCnt(%rsp) #set zero flag iff done 1204 movq %rbx,X_VARS+8*7(%rdx) 1205 1206 Skein_Debug_Round 1024,SKEIN_RND_FEED_FWD,,<cmpq $0,blkCnt(%rsp)> 1207 # go back for more blocks, if needed 1208 movq ctxPtr(%rsp),%rdi #don't muck with the flags here! 
1209 lea FRAME_OFFS(%rsp),%rbp 1210 jnz Skein1024_block_loop 1211 movq %r9 ,TWEAK+ 8(%rdx) 1212 Reset_Stack 1213 ret 1214# 1215Skein1024_Process_Block_End: 1216# 1217.if _SKEIN_DEBUG 1218Skein_Debug_Round_1024: 1219 # call here with rdx = "round number", 1220_SP_OFFS_ = 8*2 #stack "offset" here: rdx, return addr 1221 # 1222 #save rest of X[] state on stack so debug routines can access it 1223 .irp _rr_,rsi,rbp,rax,rbx,r8,r9,r10,r11,r12,r13,r14,r15 1224 movq %\_rr_,X_stk+8*o1K_\_rr_+_SP_OFFS_(%rsp) 1225 .endr 1226 # Figure out what to do with x0 (rdi). When rdx == 0 mod 4, it's already on stack 1227 cmpq $SKEIN_RND_SPECIAL,%rdx #special rounds always save 1228 jae save_x0 1229 testq $3,%rdx #otherwise only if rdx != 0 mod 4 1230 jz save_x0_not 1231save_x0: 1232 movq %rdi,X_stk+8*o1K_rdi+_SP_OFFS_(%rsp) 1233save_x0_not: 1234 #figure out the x4/x6 swapping state and save the correct one! 1235 cmpq $SKEIN_RND_SPECIAL,%rdx #special rounds always do x4 1236 jae save_x4 1237 testq $1,%rdx #and even ones have r4 as well 1238 jz save_x4 1239 movq %rcx,X_stk+8*6+_SP_OFFS_(%rsp) 1240 jmp debug_1024_go 1241save_x4: 1242 movq %rcx,X_stk+8*4+_SP_OFFS_(%rsp) 1243debug_1024_go: 1244 #now all is saved in Xstk[] except for rdx 1245 push %rsi #save two regs for BLK_BITS-specific parms 1246 push %rdi 1247_SP_OFFS_ = _SP_OFFS_ + 16 #adjust stack offset accordingly (now 32) 1248 1249 movq _SP_OFFS_-8(%rsp),%rsi #get back original %rdx (pushed on stack in macro call) 1250 movq %rsi,X_stk+8*o1K_rdx+_SP_OFFS_(%rsp) #and save it in its rightful place in X_stk[] 1251 1252 movq ctxPtr+_SP_OFFS_(%rsp),%rsi #rsi = ctx_hdr_ptr 1253 movq $1024,%rdi #rdi = block size 1254 jmp Skein_Debug_Round_Common 1255.endif 1256# 1257.if _SKEIN_CODE_SIZE 1258C_label Skein1024_Process_Block_CodeSize 1259 movq $(Skein1024_Process_Block_End-Skein1024_Process_Block),%rax 1260 ret 1261# 1262C_label Skein1024_Unroll_Cnt 1263 .if _UNROLL_CNT <> (ROUNDS_1024/8) 1264 movq $_UNROLL_CNT,%rax 1265 .else 1266 xorq 
%rax,%rax 1267 .endif 1268 ret 1269.endif 1270# 1271.endif # _USE_ASM_ and 1024 1272# 1273.if _SKEIN_DEBUG 1274#---------------------------------------------------------------- 1275#local debug routine to set up for calls to: 1276# void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,int r,const u64b_t *X) 1277# [ rdi rsi rdx rcx] 1278# 1279# here with %rdx = round number 1280# %rsi = ctx_hdr_ptr 1281# %rdi = block size (256/512/1024) 1282# on stack: saved rdi, saved rsi, retAddr, saved rdx 1283# 1284Skein_Debug_Round_Common: 1285_SP_OFFS_ = 32 #account for four words on stack already 1286 .irp _rr_,rax,rbx,rcx,rbp,r8,r9,r10,r11,r12,r13,r14,r15 #save the rest of the regs 1287 pushq %\_rr_ 1288_SP_OFFS_ = _SP_OFFS_+8 1289 .endr 1290 .if (_SP_OFFS_ % 16) # make sure stack is still 16-byte aligned here 1291 .error "Debug_Round_Common: stack alignment" 1292 .endif 1293 # compute %rcx = ptr to the X[] array on the stack (final parameter to call) 1294 leaq X_stk+_SP_OFFS_(%rsp),%rcx #adjust for reg pushes, return address 1295 cmpq $SKEIN_RND_FEED_FWD,%rdx #special handling for feedforward "round"? 1296 jnz _got_rcxA 1297 leaq X_VARS(%rsi),%rcx 1298_got_rcxA: 1299 .if _USE_ASM_ & 1024 1300 # special handling for 1024-bit case 1301 # (for rounds right before with key injection: 1302 # use xDebug_1024[] instead of X_stk[]) 1303 cmpq $SKEIN_RND_SPECIAL,%rdx 1304 jae _got_rcxB #must be a normal round 1305 orq %rdx,%rdx 1306 jz _got_rcxB #just before key injection 1307 test $3,%rdx 1308 jne _got_rcxB 1309 cmp $1024,%rdi #only 1024-bit(s) for now 1310 jne _got_rcxB 1311 leaq xDebug_1024+_SP_OFFS_(%rsp),%rcx 1312_got_rcxB: 1313 .endif 1314 call Skein_Show_Round #call external debug handler 1315 1316 .irp _rr_,r15,r14,r13,r12,r11,r10,r9,r8,rbp,rcx,rbx,rax #restore regs 1317 popq %\_rr_ 1318_SP_OFFS_ = _SP_OFFS_-8 1319 .endr 1320 .if _SP_OFFS_ - 32 1321 .error "Debug_Round_Common: push/pop misalignment!" 
1322 .endif 1323 popq %rdi 1324 popq %rsi 1325 ret 1326.endif 1327#---------------------------------------------------------------- 1328 .section .note.GNU-stack,"",@progbits 1329 1330 .end 1331