skein_block_asm.s revision 312557
1#
2#----------------------------------------------------------------
3# 64-bit x86 assembler code (gnu as) for Skein block functions
4#
5# Author: Doug Whiting, Hifn/Exar
6#
7# This code is released to the public domain.
8#----------------------------------------------------------------
9#
10    .text
11    .altmacro
12    .psize 0,128                            #list file has no page boundaries
13#
14_MASK_ALL_  =  (256+512+1024)               #all three algorithm bits
15_MAX_FRAME_ =  240
16#
17#################
18.ifndef SKEIN_USE_ASM
19_USE_ASM_         = _MASK_ALL_
20.else
21_USE_ASM_         = SKEIN_USE_ASM
22.endif
23#################
# SKEIN_LOOP is read as three decimal digits: hundreds = Skein-256, tens = Skein-512,
# units = Skein-1024.  Each digit is the unroll count for that block size; a digit of 0
# selects the fully unrolled code.
24.ifndef SKEIN_LOOP                          #configure loop unrolling
25_SKEIN_LOOP       =   2                     #default is fully unrolled for 256/512, twice for 1024
26.else
27_SKEIN_LOOP       = SKEIN_LOOP
28  .irp _NN_,%_SKEIN_LOOP                #only display loop unrolling if default changed on command line
29.print  "+++ SKEIN_LOOP = \_NN_"
30  .endr
31.endif
32# the unroll counts (0 --> fully unrolled)
33SKEIN_UNROLL_256  = (_SKEIN_LOOP / 100) % 10
34SKEIN_UNROLL_512  = (_SKEIN_LOOP /  10) % 10
35SKEIN_UNROLL_1024 = (_SKEIN_LOOP      ) % 10
36#
# SKEIN_ASM_UNROLL accumulates the block sizes (256, 512, 1024) whose unroll count is 0,
# so later code can test one size with a mask, e.g. SKEIN_ASM_UNROLL & 256
37SKEIN_ASM_UNROLL  = 0
38  .irp _NN_,256,512,1024
39    .if (SKEIN_UNROLL_\_NN_) == 0
40SKEIN_ASM_UNROLL  = SKEIN_ASM_UNROLL + \_NN_
41    .endif
42  .endr
43#################
44#
# Round counts per block size.  When SKEIN_ROUNDS is defined, its decimal digits
# (hundreds = 256, tens = 512, units = 1024) each select 8*(((digit+5) % 10)+5) rounds:
# digits 5..9 give 40..72 rounds and digits 0..4 give 80..112 rounds.
# NOTE(review): the "&&" in the .irp body below looks intended as a bit-mask test of
# _USE_ASM_ (top-level code in this file writes "_USE_ASM_ & 256"); how "&&" is expanded
# inside .irp/.macro bodies under .altmacro should be confirmed for the assembler in use.
45.ifndef SKEIN_ROUNDS
46ROUNDS_256  =   72
47ROUNDS_512  =   72
48ROUNDS_1024 =   80
49.else
50ROUNDS_256  = 8*((((SKEIN_ROUNDS / 100) + 5) % 10) + 5)
51ROUNDS_512  = 8*((((SKEIN_ROUNDS /  10) + 5) % 10) + 5)
52ROUNDS_1024 = 8*((((SKEIN_ROUNDS      ) + 5) % 10) + 5)
53# only display rounds if default size is changed on command line
54.irp _NN_,256,512,1024
55  .if _USE_ASM_ && \_NN_
56    .irp _RR_,%(ROUNDS_\_NN_)
57      .if _NN_ < 1024
58.print  "+++ SKEIN_ROUNDS_\_NN_  = \_RR_"
59      .else
60.print  "+++ SKEIN_ROUNDS_\_NN_ = \_RR_"
61      .endif
62    .endr
63  .endif
64.endr
65.endif
66#################
67#
68.ifdef SKEIN_CODE_SIZE
69_SKEIN_CODE_SIZE = (1)
70.else
71.ifdef  SKEIN_PERF                           #use code size if SKEIN_PERF is defined
72_SKEIN_CODE_SIZE = (1)
73.else
74_SKEIN_CODE_SIZE = (0)
75.endif
76.endif
77#
78#################
79#
80.ifndef SKEIN_DEBUG
81_SKEIN_DEBUG      = 0
82.else
83_SKEIN_DEBUG      = 1
84.endif
85#################
86#
87# define offsets of fields in hash context structure
88#
89HASH_BITS   =   0                   #bits of hash output
90BCNT        =   8 + HASH_BITS       #number of bytes in BUFFER[]
91TWEAK       =   8 + BCNT            #tweak values[0..1]
92X_VARS      =  16 + TWEAK           #chaining vars
93#
94#(Note: buffer[] in context structure is NOT needed here :-)
95#
96KW_PARITY   =   0x1BD11BDAA9FC1A22  #overall parity of key schedule words
97FIRST_MASK  =   ~ (1 <<  6)
98FIRST_MASK64=   ~ (1 << 62)
99#
100# rotation constants for Skein, named RC_<block bits>_<round number mod 8>_<MIX index within the round>
101#
102RC_256_0_0  = 14
103RC_256_0_1  = 16
104
105RC_256_1_0  = 52
106RC_256_1_1  = 57
107
108RC_256_2_0  = 23
109RC_256_2_1  = 40
110
111RC_256_3_0  =  5
112RC_256_3_1  = 37
113
114RC_256_4_0  = 25
115RC_256_4_1  = 33
116
117RC_256_5_0  = 46
118RC_256_5_1  = 12
119
120RC_256_6_0  = 58
121RC_256_6_1  = 22
122
123RC_256_7_0  = 32
124RC_256_7_1  = 32
125
126RC_512_0_0  = 46
127RC_512_0_1  = 36
128RC_512_0_2  = 19
129RC_512_0_3  = 37
130
131RC_512_1_0  = 33
132RC_512_1_1  = 27
133RC_512_1_2  = 14
134RC_512_1_3  = 42
135
136RC_512_2_0  = 17
137RC_512_2_1  = 49
138RC_512_2_2  = 36
139RC_512_2_3  = 39
140
141RC_512_3_0  = 44
142RC_512_3_1  =  9
143RC_512_3_2  = 54
144RC_512_3_3  = 56
145
146RC_512_4_0  = 39
147RC_512_4_1  = 30
148RC_512_4_2  = 34
149RC_512_4_3  = 24
150
151RC_512_5_0  = 13
152RC_512_5_1  = 50
153RC_512_5_2  = 10
154RC_512_5_3  = 17
155
156RC_512_6_0  = 25
157RC_512_6_1  = 29
158RC_512_6_2  = 39
159RC_512_6_3  = 43
160
161RC_512_7_0  =  8
162RC_512_7_1  = 35
163RC_512_7_2  = 56
164RC_512_7_3  = 22
165
166RC_1024_0_0 = 24
167RC_1024_0_1 = 13
168RC_1024_0_2 =  8
169RC_1024_0_3 = 47
170RC_1024_0_4 =  8
171RC_1024_0_5 = 17
172RC_1024_0_6 = 22
173RC_1024_0_7 = 37
174
175RC_1024_1_0 = 38
176RC_1024_1_1 = 19
177RC_1024_1_2 = 10
178RC_1024_1_3 = 55
179RC_1024_1_4 = 49
180RC_1024_1_5 = 18
181RC_1024_1_6 = 23
182RC_1024_1_7 = 52
183
184RC_1024_2_0 = 33
185RC_1024_2_1 =  4
186RC_1024_2_2 = 51
187RC_1024_2_3 = 13
188RC_1024_2_4 = 34
189RC_1024_2_5 = 41
190RC_1024_2_6 = 59
191RC_1024_2_7 = 17
192
193RC_1024_3_0 =  5
194RC_1024_3_1 = 20
195RC_1024_3_2 = 48
196RC_1024_3_3 = 41
197RC_1024_3_4 = 47
198RC_1024_3_5 = 28
199RC_1024_3_6 = 16
200RC_1024_3_7 = 25
201
202RC_1024_4_0 = 41
203RC_1024_4_1 =  9
204RC_1024_4_2 = 37
205RC_1024_4_3 = 31
206RC_1024_4_4 = 12
207RC_1024_4_5 = 47
208RC_1024_4_6 = 44
209RC_1024_4_7 = 30
210
211RC_1024_5_0 = 16
212RC_1024_5_1 = 34
213RC_1024_5_2 = 56
214RC_1024_5_3 = 51
215RC_1024_5_4 =  4
216RC_1024_5_5 = 53
217RC_1024_5_6 = 42
218RC_1024_5_7 = 41
219
220RC_1024_6_0 = 31
221RC_1024_6_1 = 44
222RC_1024_6_2 = 47
223RC_1024_6_3 = 46
224RC_1024_6_4 = 19
225RC_1024_6_5 = 42
226RC_1024_6_6 = 44
227RC_1024_6_7 = 25
228
229RC_1024_7_0 =  9
230RC_1024_7_1 = 48
231RC_1024_7_2 = 35
232RC_1024_7_3 = 52
233RC_1024_7_4 = 23
234RC_1024_7_5 = 31
235RC_1024_7_6 = 37
236RC_1024_7_7 = 20
237#
238#  Input:  reg
239# Output: <reg> <<< RC_BlkSize_roundNum_mixNum, BlkSize=256/512/1024
240#
# RotL64 reg,BLK_SIZE,ROUND_NUM,MIX_NUM
#   Rotates the 64-bit register named by reg left by the constant
#   RC_<BLK_SIZE>_<ROUND_NUM>_<MIX_NUM>; no instruction is emitted when that constant is 0.
#   The "&" after an argument ends the argument name so that the "_" following it is
#   taken as literal text of the symbol being built.
#   Side effect: (re)defines the assembler symbol _RCNT_.
241.macro RotL64   reg,BLK_SIZE,ROUND_NUM,MIX_NUM
242_RCNT_ = RC_\BLK_SIZE&_\ROUND_NUM&_\MIX_NUM
243  .if _RCNT_  #is there anything to do?
244    rolq    $_RCNT_,%\reg
245  .endif
246.endm
247#
248#----------------------------------------------------------------
249#
250# MACROS: define local vars and configure stack
251#
252#----------------------------------------------------------------
253# declare allocated space on the stack
# StackVar localName,localSize
#   Defines the symbol localName as the current value of _STK_OFFS_, then advances
#   _STK_OFFS_ by localSize bytes.  _STK_OFFS_ must already be defined when this is
#   used (Setup_Stack sets it to 0 before its first StackVar).
254.macro StackVar localName,localSize
255\localName  =   _STK_OFFS_
256_STK_OFFS_  =   _STK_OFFS_+(\localSize)
257.endm #StackVar
258#
259#----------------------------------------------------------------
260#
261# MACRO: Configure stack frame, allocate local vars
262#
# Setup_Stack BLK_BITS,KS_CNT,debugCnt
#   Function prologue.  Pushes rbp,rbx,r12-r15, lays out the locals with StackVar
#   (each name becomes an offset from the new rsp), subtracts LOCAL_SIZE from rsp,
#   points rbp at rsp+FRAME_OFFS and stores the incoming rdi,rsi,rdx,rcx into
#   ctxPtr, blkPtr, blkCnt and bitAdd.  Locals are then addressed as name+F_O(%rbp)
#   or name(%rsp), where F_O = -FRAME_OFFS.
#     BLK_BITS  block size in bits; WCNT is set to BLK_BITS/64
#     KS_CNT    the extra ksRot area is 16*KS_CNT bytes; it is only allocated when the
#               SKEIN_ASM_UNROLL test below evaluates to 0 for this block size
#     debugCnt  optional; number of 8-byte xDebug_<BLK_BITS> slots (only if _SKEIN_DEBUG)
#   Symbols (re)defined: WCNT, _PushCnt_, _STK_OFFS_, LOCAL_SIZE, FRAME_OFFS, F_O, the
#   local-variable offsets, tmpStk_<BLK_BITS> and the __STK_*_<BLK_BITS> listing symbols.
#   NOTE(review): "SKEIN_ASM_UNROLL && (BLK_BITS)" looks intended as a bit-mask test (top-level
#   code in this file writes "SKEIN_ASM_UNROLL & 256"); confirm how the assembler in
#   use expands "&&" inside a macro body under .altmacro.
#   NOTE(review): every size passed to StackVar here is a multiple of 8, so the "% 8"
#   test before align16 is always true and align16/tmpStk_<BLK_BITS> are always defined,
#   even though the comment on that line speaks of 16-byte alignment.
263.macro Setup_Stack BLK_BITS,KS_CNT,debugCnt
264    WCNT    =    (\BLK_BITS)/64
265#
266_PushCnt_   =   0                   #save nonvolatile regs on stack
267  .irp _reg_,rbp,rbx,r12,r13,r14,r15
268       pushq    %\_reg_
269_PushCnt_ = _PushCnt_ + 1           #track count to keep alignment
270  .endr
271#
272_STK_OFFS_  =   0                   #starting offset from rsp
273    #---- local  variables         #<-- rsp
274    StackVar    X_stk  ,8*(WCNT)    #local context vars
275    StackVar    ksTwk  ,8*3         #key schedule: tweak words
276    StackVar    ksKey  ,8*(WCNT)+8  #key schedule: key   words
277  .if (SKEIN_ASM_UNROLL && (\BLK_BITS)) == 0
278    StackVar    ksRot ,16*(\KS_CNT) #leave space for "rotation" to happen
279  .endif
280    StackVar    Wcopy  ,8*(WCNT)    #copy of input block
281  .if _SKEIN_DEBUG
282  .if \debugCnt + 0                 #temp location for debug X[] info
283    StackVar    xDebug_\BLK_BITS ,8*(\debugCnt)
284  .endif
285  .endif
286  .if ((8*_PushCnt_ + _STK_OFFS_) % 8) == 0
287    StackVar    align16,8           #keep 16-byte aligned (adjust for retAddr?)
288tmpStk_\BLK_BITS = align16          #the padding slot doubles as an 8-byte scratch variable
289  .endif
290    #---- saved caller parameters (from regs rdi, rsi, rdx, rcx)
291    StackVar    ctxPtr ,8           #context ptr
292    StackVar    blkPtr ,8           #pointer to block data
293    StackVar    blkCnt ,8           #number of full blocks to process
294    StackVar    bitAdd ,8           #bit count to add to tweak
295LOCAL_SIZE  =   _STK_OFFS_          #size of "local" vars
296    #----
297    StackVar    savRegs,8*_PushCnt_ #saved registers
298    StackVar    retAddr,8           #return address
299    #---- caller's stack frame (aligned mod 16)
300#
301# set up the stack frame pointer (rbp)
302#
303FRAME_OFFS  =   ksTwk + 128         #allow short (negative) offset to ksTwk, ksKey
304  .if FRAME_OFFS > _STK_OFFS_       #keep rbp in the "locals" range
305FRAME_OFFS  =      _STK_OFFS_
306  .endif
307F_O         =   -FRAME_OFFS
308#
309  #put some useful defines in the .lst file (for grep)
310__STK_LCL_SIZE_\BLK_BITS = LOCAL_SIZE
311__STK_TOT_SIZE_\BLK_BITS = _STK_OFFS_
312__STK_FRM_OFFS_\BLK_BITS = FRAME_OFFS
313#
314# Notes on stack frame setup:
315#   * the most frequently used variable is X_stk[], based at [rsp+0]
316#   * the next most used is the key schedule arrays, ksKey and ksTwk
317#       so rbp is "centered" there, allowing short offsets to the key
318#       schedule even in 1024-bit Skein case
319#   * the Wcopy variables are infrequently accessed, but they have long
320#       offsets from both rsp and rbp only in the 1024-bit case.
321#   * all other local vars and calling parameters can be accessed
322#       with short offsets, except in the 1024-bit case
323#
324    subq    $LOCAL_SIZE,%rsp        #make room for the locals
325    leaq    FRAME_OFFS(%rsp),%rbp   #maximize use of short offsets
326    movq    %rdi, ctxPtr+F_O(%rbp)  #save caller's parameters on the stack
327    movq    %rsi, blkPtr+F_O(%rbp)
328    movq    %rdx, blkCnt+F_O(%rbp)
329    movq    %rcx, bitAdd+F_O(%rbp)
330#
331.endm #Setup_Stack
332#
333#----------------------------------------------------------------
334#
# Reset_Stack
#   Epilogue matching Setup_Stack: releases the LOCAL_SIZE bytes of locals and pops
#   r15,r14,r13,r12,rbx,rbp (the reverse of the push order in Setup_Stack).
#   Each pop decrements _PushCnt_; assembly fails if it does not end at 0.
#   The locals are not cleared before being released.
335.macro Reset_Stack
336    addq    $LOCAL_SIZE,%rsp        #get rid of locals (wipe??)
337  .irp _reg_,r15,r14,r13,r12,rbx,rbp
338    popq    %\_reg_                 #restore caller's regs
339_PushCnt_ = _PushCnt_ - 1
340  .endr
341  .if _PushCnt_
342    .error  "Mismatched push/pops?"
343  .endif
344.endm # Reset_Stack
345#
346#----------------------------------------------------------------
347# macros to help debug internals
348#
349.if _SKEIN_DEBUG
350    .extern  Skein_Show_Block     #calls to C routines
351    .extern  Skein_Show_Round
352#
353SKEIN_RND_SPECIAL       =   1000
354SKEIN_RND_KEY_INITIAL   =   SKEIN_RND_SPECIAL+0
355SKEIN_RND_KEY_INJECT    =   SKEIN_RND_SPECIAL+1
356SKEIN_RND_FEED_FWD      =   SKEIN_RND_SPECIAL+2
357#
# Skein_Debug_Block BLK_BITS
#   Calls the external C routine Skein_Show_Block for the block being processed.
#   All nine volatile integer registers (rax,rcx,rdx,rsi,rdi,r8-r11) are saved
#   around the call and restored afterwards.  The first six arguments go in
#   rdi,rsi,rdx,rcx,r8,r9; the seventh (tweak pointer) is pushed on the stack and
#   discarded after the call.  Uses the frame set up by Setup_Stack (rbp, F_O,
#   ctxPtr, blkPtr, Wcopy, ksKey, ksTwk).
#   The two .error messages name the block size through the BLK_BITS parameter.
358.macro Skein_Debug_Block BLK_BITS
359#
360#void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X,
361#                     const u08b_t *blkPtr, const u64b_t *wPtr,
362#                     const u64b_t *ksPtr,const u64b_t *tsPtr)
363#
364_NN_ = 0
365  .irp _reg_,rax,rcx,rdx,rsi,rdi,r8,r9,r10,r11
366    pushq   %\_reg_                 #save all volatile regs on stack before the call
367_NN_ = _NN_ + 1
368  .endr
369    # get and push call parameters
370    movq    $\BLK_BITS      ,%rdi   #bits
371    movq    ctxPtr+F_O(%rbp),%rsi   #h (pointer)
372    leaq    X_VARS    (%rsi),%rdx   #X (pointer)
373    movq    blkPtr+F_O(%rbp),%rcx   #blkPtr
374    leaq    Wcopy +F_O(%rbp),%r8    #wPtr
375    leaq    ksKey +F_O(%rbp),%r9    #key pointer
376    leaq    ksTwk +F_O(%rbp),%rax   #tweak pointer
377    pushq   %rax                    #   (pass on the stack)
378    call    Skein_Show_Block        #call external debug handler
379    addq    $8*1,%rsp               #discard parameters on stack
380  .if (_NN_ % 2 ) == 0              #check stack alignment
381    .error "Stack misalignment problem in Skein_Debug_Block_\BLK_BITS"
382  .endif
383  .irp _reg_,r11,r10,r9,r8,rdi,rsi,rdx,rcx,rax
384    popq    %\_reg_                 #restore regs
385_NN_ = _NN_ - 1
386  .endr
387  .if _NN_
388    .error "Push/pop mismatch problem in Skein_Debug_Block_\BLK_BITS"
389  .endif
390.endm # Skein_Debug_Block
391#
392# the macro to "call" to debug a round
393#
# Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp
#   Calls the local helper Skein_Debug_Round_<BLK_BITS> with the round "number" in rdx;
#   the caller's rdx is pushed before and popped after the call.
#     BLK_BITS  block size, selects the helper
#     R         round number, or one of the SKEIN_RND_* special codes (>= SKEIN_RND_SPECIAL)
#     RDI_OFFS  optional constant added to the computed round number (looping code only)
#     afterOp   optional text placed after the call (presumably substituted by name
#               under .altmacro -- confirm)
#   When the block size is fully unrolled, or R is a special code, R is passed as is.
#   Otherwise the number is computed as 4*index + 1 + ((R-1) masked with 3) + RDI_OFFS,
#   where index is rdi, or for 1024 the value stored at rIdx_offs on the stack.
#   NOTE(review): both "&&" below look intended as bit-mask operations; confirm how
#   the assembler in use expands "&&" inside a macro body under .altmacro.
394.macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp
395    # call the appropriate (local) debug "function"
396    pushq   %rdx                    #save rdx, so we can use it for round "number"
397  .if (SKEIN_ASM_UNROLL && \BLK_BITS) || (\R >= SKEIN_RND_SPECIAL)
398    movq    $\R,%rdx
399  .else                             #compute round number using rdi
400_rOffs_ = \RDI_OFFS + 0
401   .if \BLK_BITS == 1024
402    movq    rIdx_offs+8(%rsp),%rdx  #get rIdx off the stack (adjust for pushq rdx above)
403    leaq    1+(((\R)-1) && 3)+_rOffs_(,%rdx,4),%rdx
404   .else
405    leaq    1+(((\R)-1) && 3)+_rOffs_(,%rdi,4),%rdx
406   .endif
407  .endif
408    call    Skein_Debug_Round_\BLK_BITS
409    popq    %rdx                    #restore original rdx value
410#
411    afterOp
412.endm  #  Skein_Debug_Round
413.else  #------- _SKEIN_DEBUG (dummy macros if debug not enabled)
# Non-debug build: both debug macros keep the same parameter lists as the debug
# versions but have empty bodies, so every invocation assembles to nothing.
414.macro Skein_Debug_Block BLK_BITS
415.endm
416#
417.macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp
418.endm
419#
420.endif # _SKEIN_DEBUG
421#
422#----------------------------------------------------------------
423#
# addReg dstReg,srcReg_A,srcReg_B,useAddOp,immOffs
#   dstReg = dstReg + source register (+ immOffs).
#   The source register name is the concatenation of srcReg_A and srcReg_B, so it can
#   be given whole (rbx) or as a prefix plus a computed suffix (r,%(8+...)).
#     immOffs   nonzero: emit leaq with immOffs as the displacement
#     useAddOp  nonzero (with immOffs blank or zero): emit addq
#     both blank or zero: emit leaq, or addq when ASM_NO_LEA is defined
#   Because either leaq or addq may be emitted, code following addReg must not
#   depend on the flags (leaq leaves them unchanged, addq sets them).
424.macro  addReg dstReg,srcReg_A,srcReg_B,useAddOp,immOffs
425  .if \immOffs + 0
426       leaq    \immOffs(%\srcReg_A\srcReg_B,%\dstReg),%\dstReg
427  .elseif ((\useAddOp + 0) == 0)
428    .ifndef ASM_NO_LEA  #lea seems to be faster on Core 2 Duo CPUs!
429       leaq   (%\srcReg_A\srcReg_B,%\dstReg),%\dstReg
430    .else
431       addq    %\srcReg_A\srcReg_B,%\dstReg
432    .endif
433  .else
434       addq    %\srcReg_A\srcReg_B,%\dstReg
435  .endif
436.endm
437
438# keep Intel-style ordering here, to match addReg
# xorReg dstReg,srcReg_A,srcReg_B
#   dstReg = dstReg XOR source register; as in addReg, the source register name is
#   the concatenation of srcReg_A and srcReg_B.  Arguments are destination-first.
439.macro  xorReg dstReg,srcReg_A,srcReg_B
440        xorq   %\srcReg_A\srcReg_B,%\dstReg
441.endm
442#
443#----------------------------------------------------------------
444#
# C_label lName
#   Defines two global labels at the current location: lName and _lName (with a
#   leading underscore), so the symbol resolves under either C naming convention.
445.macro C_label lName
446 \lName:        #use both "genders" to work across linkage conventions
447_\lName:
448    .global  \lName
449    .global _\lName
450.endm
451#
452#=================================== Skein_256 =============================================
453#
454.if _USE_ASM_ & 256
455#
456# void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)#
457#
458#################
459#
460# code
461#
# Skein_256_Process_Block: entry and per-block setup.
#   Arguments arrive in rdi (ctx), rsi (blkPtr), rdx (blkCnt), rcx (bitcntAdd) and are
#   saved on the stack by Setup_Stack.  T[1] is loaded into r14 once, before the block loop.
#   For each block: T[0] += bitAdd (written back to the context), the third tweak word
#   r15 = T[0] ^ T[1], the parity key word r12 = KW_PARITY ^ X[0] ^ X[1] ^ X[2] ^ X[3],
#   then the input block is loaded, copied to Wcopy[] and receives the initial key injection.
462C_label Skein_256_Process_Block
463    Setup_Stack 256,((ROUNDS_256/8)+1)
464    movq    TWEAK+8(%rdi),%r14
465    jmp     Skein_256_block_loop
466    .p2align 4
467    # main hash loop for Skein_256
468Skein_256_block_loop:
469    #
470    # general register usage:
471    #   RAX..RDX        = X0..X3
472    #   R08..R12        = ks[0..4]
473    #   R13..R15        = ts[0..2]
474    #   RSP, RBP        = stack/frame pointers
475    #   RDI             = round counter or context pointer
476    #   RSI             = temp
477    #
478    movq    TWEAK+0(%rdi)     ,%r13
479    addq    bitAdd+F_O(%rbp)  ,%r13  #computed updated tweak value T0
480    movq    %r14              ,%r15
481    xorq    %r13              ,%r15  #now %r13.%r15 is set as the tweak
482
483    movq    $KW_PARITY        ,%r12
484    movq       X_VARS+ 0(%rdi),%r8
485    movq       X_VARS+ 8(%rdi),%r9
486    movq       X_VARS+16(%rdi),%r10
487    movq       X_VARS+24(%rdi),%r11
488    movq    %r13,TWEAK+0(%rdi)       #save updated tweak value ctx->h.T[0]
489    xorq    %r8               ,%r12  #start accumulating overall parity
490
491    movq    blkPtr +F_O(%rbp) ,%rsi  #rsi --> input block
492    xorq    %r9               ,%r12
493    movq     0(%rsi)          ,%rax  #get X[0..3]
494    xorq    %r10              ,%r12
495    movq     8(%rsi)          ,%rbx
496    xorq    %r11              ,%r12
497    movq    16(%rsi)          ,%rcx
498    movq    24(%rsi)          ,%rdx
499
500    movq    %rax,Wcopy+ 0+F_O(%rbp)  #save copy of input block
501    movq    %rbx,Wcopy+ 8+F_O(%rbp)
502    movq    %rcx,Wcopy+16+F_O(%rbp)
503    movq    %rdx,Wcopy+24+F_O(%rbp)
504
505    addq    %r8 ,%rax                #initial key injection
506    addq    %r9 ,%rbx
507    addq    %r10,%rcx
508    addq    %r11,%rdx
509    addq    %r13,%rbx                #X1 += ts[0]
510    addq    %r14,%rcx                #X2 += ts[1]
511
512.if _SKEIN_DEBUG
513    movq    %r14,TWEAK+ 8(%rdi)      #save updated tweak T[1] (start bit cleared?)
514    movq    %r8 ,ksKey+ 0+F_O(%rbp)  #save key schedule on stack for Skein_Debug_Block
515    movq    %r9 ,ksKey+ 8+F_O(%rbp)
516    movq    %r10,ksKey+16+F_O(%rbp)
517    movq    %r11,ksKey+24+F_O(%rbp)
518    movq    %r12,ksKey+32+F_O(%rbp)
519
520    movq    %r13,ksTwk+ 0+F_O(%rbp)
521    movq    %r14,ksTwk+ 8+F_O(%rbp)
522    movq    %r15,ksTwk+16+F_O(%rbp)
523
524    movq    %rax,X_stk + 0(%rsp)     #save X[] on stack for Skein_Debug_Block
525    movq    %rbx,X_stk + 8(%rsp)
526    movq    %rcx,X_stk +16(%rsp)
527    movq    %rdx,X_stk +24(%rsp)
528
529    Skein_Debug_Block 256            #debug dump
530    Skein_Debug_Round 256,SKEIN_RND_KEY_INITIAL
531.endif
532#
# Looping code only: ks[0] goes to slot 5 (ksKey+40) and ts[0] to slot 3 (ksTwk+24)
# rather than slot 0.  The round loop reads ksKey and ksTwk from slot 1+rdi upward
# (rdi starts at 0), so slot 0 is never read there.  With the Setup_Stack layout
# (ksTwk is 3 words, immediately followed by ksKey) ksTwk+24 is the same address as
# ksKey+0; in a debug build these stores therefore replace the ks[0] value written above.
533.if ((SKEIN_ASM_UNROLL & 256) == 0)
534    movq    %r8 ,ksKey+40+F_O(%rbp)  #save key schedule on stack for looping code
535    movq    %r9 ,ksKey+ 8+F_O(%rbp)
536    movq    %r10,ksKey+16+F_O(%rbp)
537    movq    %r11,ksKey+24+F_O(%rbp)
538    movq    %r12,ksKey+32+F_O(%rbp)
539
540    movq    %r13,ksTwk+24+F_O(%rbp)
541    movq    %r14,ksTwk+ 8+F_O(%rbp)
542    movq    %r15,ksTwk+16+F_O(%rbp)
543.endif
544    addq    $WCNT*8,%rsi             #skip the block
545    movq    %rsi,blkPtr  +F_O(%rbp)  #update block pointer
546    #
547    # now the key schedule is computed. Start the rounds
548    #
# Skein-256 rounds.  Each .rept iteration below is four rounds followed by one key
# injection, and the body is repeated _UNROLL_CNT*2 times (8*_UNROLL_CNT rounds).
#   Fully unrolled: key and tweak words stay in r8-r12 / r13-r15; rdi and rsi are used
#     as temporaries holding precomputed key+tweak sums, and the injection number is
#     added to rdx as a leaq displacement.
#   Looping: rdi counts the key injections done so far and indexes ksKey/ksTwk on the
#     stack; the words just used are stored again further up (ksKey slot 6+rdi,
#     ksTwk slot 4+rdi) so the schedule can be read linearly; the loop repeats until
#     rdi reaches 2*(ROUNDS_256/8).
549.if SKEIN_ASM_UNROLL & 256
550_UNROLL_CNT =   ROUNDS_256/8
551.else
552_UNROLL_CNT =   SKEIN_UNROLL_256
553  .if ((ROUNDS_256/8) % _UNROLL_CNT)
554    .error "Invalid SKEIN_UNROLL_256"
555  .endif
556    xorq    %rdi,%rdi                #rdi = iteration count
557Skein_256_round_loop:
558.endif
559_Rbase_ = 0
560.rept _UNROLL_CNT*2
561    # all X and ks vars in regs      # (ops to "rotate" ks vars, via mem, if not unrolled)
562    # round 4*_RBase_ + 0
563    addReg  rax, rbx
564    RotL64  rbx, 256,%((4*_Rbase_+0) % 8),0
565    addReg  rcx, rdx
566                .if (SKEIN_ASM_UNROLL & 256) == 0
567                    movq ksKey+8*1+F_O(%rbp,%rdi,8),%r8
568                .endif
569    xorReg  rbx, rax
570    RotL64  rdx, 256,%((4*_Rbase_+0) % 8),1
571    xorReg  rdx, rcx
572  .if SKEIN_ASM_UNROLL & 256
573    .irp _r0_,%( 8+(_Rbase_+3) % 5)
574    .irp _r1_,%(13+(_Rbase_+2) % 3)
575      leaq   (%r\_r0_,%r\_r1_),%rdi    #precompute key injection value for %rcx
576    .endr
577    .endr
578  .endif
579                .if (SKEIN_ASM_UNROLL & 256) == 0
580                    movq ksTwk+8*1+F_O(%rbp,%rdi,8),%r13
581                .endif
582    Skein_Debug_Round 256,%(4*_Rbase_+1)
583
584    # round 4*_Rbase_ + 1
585    addReg  rax, rdx
586    RotL64  rdx, 256,%((4*_Rbase_+1) % 8),0
587    xorReg  rdx, rax
588                .if (SKEIN_ASM_UNROLL & 256) == 0
589                    movq ksKey+8*2+F_O(%rbp,%rdi,8),%r9
590                .endif
591    addReg  rcx, rbx
592    RotL64  rbx, 256,%((4*_Rbase_+1) % 8),1
593    xorReg  rbx, rcx
594                .if (SKEIN_ASM_UNROLL & 256) == 0
595                    movq ksKey+8*4+F_O(%rbp,%rdi,8),%r11
596                .endif
597    Skein_Debug_Round 256,%(4*_Rbase_+2)
598 .if SKEIN_ASM_UNROLL & 256
599    .irp _r0_,%( 8+(_Rbase_+2) % 5)
600    .irp _r1_,%(13+(_Rbase_+1) % 3)
601      leaq   (%r\_r0_,%r\_r1_),%rsi     #precompute key injection value for %rbx
602    .endr
603    .endr
604 .endif
605    # round 4*_Rbase_ + 2
606    addReg  rax, rbx
607    RotL64  rbx, 256,%((4*_Rbase_+2) % 8),0
608    addReg  rcx, rdx
609                .if (SKEIN_ASM_UNROLL & 256) == 0
610                    movq ksKey+8*3+F_O(%rbp,%rdi,8),%r10
611                .endif
612    xorReg  rbx, rax
613    RotL64  rdx, 256,%((4*_Rbase_+2) % 8),1
614    xorReg  rdx, rcx
615                .if (SKEIN_ASM_UNROLL & 256) == 0
616                    movq %r8,ksKey+8*6+F_O(%rbp,%rdi,8)  #"rotate" the key
617                    leaq 1(%r11,%rdi),%r11               #precompute key + injection number (rdi+1)
618                .endif
619    Skein_Debug_Round 256,%(4*_Rbase_+3)
620    # round 4*_Rbase_ + 3
621    addReg  rax, rdx
622    RotL64  rdx, 256,%((4*_Rbase_+3) % 8),0
623    addReg  rcx, rbx
624                .if (SKEIN_ASM_UNROLL & 256) == 0
625                    addq      ksTwk+8*2+F_O(%rbp,%rdi,8),%r10  #precompute key + tweak
626                    movq %r13,ksTwk+8*4+F_O(%rbp,%rdi,8)       #"rotate" the tweak
627                .endif
628    xorReg  rdx, rax
629    RotL64  rbx, 256,%((4*_Rbase_+3) % 8),1
630    xorReg  rbx, rcx
631    Skein_Debug_Round 256,%(4*_Rbase_+4)
632                .if (SKEIN_ASM_UNROLL & 256) == 0
633                    addReg r9 ,r13           #precompute key+tweak
634                .endif
635      #inject key schedule words
636_Rbase_ = _Rbase_+1
637  .if SKEIN_ASM_UNROLL & 256
638    addReg    rax,r,%(8+((_Rbase_+0) % 5))
639    addReg    rbx,rsi
640    addReg    rcx,rdi
641    addReg    rdx,r,%(8+((_Rbase_+3) % 5)),,_Rbase_
642  .else
643    incq      %rdi
644    addReg    rax,r8
645    addReg    rcx,r10
646    addReg    rbx,r9
647    addReg    rdx,r11
648  .endif
649    Skein_Debug_Round 256,SKEIN_RND_KEY_INJECT
650.endr #rept _UNROLL_CNT*2
651#
652.if (SKEIN_ASM_UNROLL & 256) == 0
653    cmpq    $2*(ROUNDS_256/8),%rdi
654    jb      Skein_256_round_loop
655.endif # (SKEIN_ASM_UNROLL & 256) == 0
# End of block: reload the context pointer (rdi was used as a counter or temporary
# during the rounds), XOR the saved input words into X0..X3 and store them as the
# new chaining values.  r14 becomes T[1] masked with FIRST_MASK64 (bit 62 cleared)
# and is the T[1] value used by the next block; it is written back to the context
# only after the last block.
656    movq    ctxPtr +F_O(%rbp),%rdi           #restore rdi --> context
657
658    #----------------------------
659    # feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..3}
660    movq    $FIRST_MASK64 ,%r14
661    xorq    Wcopy + 0+F_O (%rbp),%rax
662    xorq    Wcopy + 8+F_O (%rbp),%rbx
663    xorq    Wcopy +16+F_O (%rbp),%rcx
664    xorq    Wcopy +24+F_O (%rbp),%rdx
665    andq    TWEAK + 8     (%rdi),%r14
666    movq    %rax,X_VARS+ 0(%rdi)             #store final result
667    movq    %rbx,X_VARS+ 8(%rdi)
668    movq    %rcx,X_VARS+16(%rdi)
669    movq    %rdx,X_VARS+24(%rdi)
670
671    Skein_Debug_Round 256,SKEIN_RND_FEED_FWD
672
673    # go back for more blocks, if needed
674    decq    blkCnt+F_O(%rbp)
675    jnz     Skein_256_block_loop
676    movq    %r14,TWEAK + 8(%rdi)             #store final T[1] in the context
677    Reset_Stack
678    ret
679Skein_256_Process_Block_End:
680
# Skein_Debug_Round_256: local helper reached by "call" from the Skein_Debug_Round macro.
#   On entry rdx holds the round number and the caller's rdx lies just above the
#   return address.  After the two pushes below the stack holds, from rsp upward:
#   saved rdi, saved rsi, return address, caller's rdx (hence the 24(%rsp) load).
#   X0..X3 are copied to X_stk[], then control passes to Skein_Debug_Round_Common,
#   which is defined outside the part of the file shown here.
681  .if _SKEIN_DEBUG
682Skein_Debug_Round_256:               #here with rdx == round "number" from macro
683    pushq   %rsi                     #save two regs for BLK_BITS-specific parms
684    pushq   %rdi
685    movq    24(%rsp),%rdi            #get back original rdx (pushed on stack in macro call) to rdi
686    movq    %rax,X_stk+ 0+F_O(%rbp)  #save X[] state on stack so debug routines can access it
687    movq    %rbx,X_stk+ 8+F_O(%rbp)  #(use FP_ since rsp has changed!)
688    movq    %rcx,X_stk+16+F_O(%rbp)
689    movq    %rdi,X_stk+24+F_O(%rbp)
690
691    movq    ctxPtr+F_O(%rbp),%rsi    #ctx_hdr_ptr
692    movq    $256,%rdi                #now <rdi,rsi,rdx> are set for the call
693    jmp     Skein_Debug_Round_Common
694  .endif
695#
# Optional helpers, assembled only when _SKEIN_CODE_SIZE is nonzero.
#   Skein_256_Process_Block_CodeSize returns in rax the number of bytes between the
#     labels Skein_256_Process_Block and Skein_256_Process_Block_End.
#   Skein_256_Unroll_Cnt returns in rax the value of _UNROLL_CNT, or 0 when
#     _UNROLL_CNT equals ROUNDS_256/8 (the fully unrolled case).
696.if _SKEIN_CODE_SIZE
697C_label  Skein_256_Process_Block_CodeSize
698    movq    $(Skein_256_Process_Block_End-Skein_256_Process_Block),%rax
699    ret
700#
701C_label Skein_256_Unroll_Cnt
702  .if _UNROLL_CNT <> ROUNDS_256/8
703    movq    $_UNROLL_CNT,%rax
704  .else
705    xorq    %rax,%rax
706  .endif
707    ret
708.endif
709#
710.endif #_USE_ASM_ & 256
711#
712#=================================== Skein_512 =============================================
713#
714.if _USE_ASM_ & 512
715#
716# void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)
717#
718# X[i] == %r[8+i]          #register assignments for X[] values during rounds (i=0..7)
719#
720#################
721# MACRO: one round for 512-bit blocks
722#
# R_512_OneRound rn0,...,rn7,_Rn_,op1,op2,op3,op4
#   One Skein-512 round made of four MIX steps on the register pairs
#   (r<rn0>,r<rn1>), (r<rn2>,r<rn3>), (r<rn4>,r<rn5>), (r<rn6>,r<rn7>):
#   first += second; second is rotated left by RC_512_<_Rn_ mod 8>_<step>; second ^= first.
#     rn0..rn7  numeric suffixes of the registers (8..15)
#     _Rn_      round number, selects the rotation constants and the debug round number
#     op1..op4  optional statements placed after the corresponding MIX step; the callers
#               use them to interleave key-schedule loads and stores with the rounds
723.macro R_512_OneRound rn0,rn1,rn2,rn3,rn4,rn5,rn6,rn7,_Rn_,op1,op2,op3,op4
724#
725    addReg      r\rn0, r\rn1
726    RotL64      r\rn1, 512,%((_Rn_) % 8),0
727    xorReg      r\rn1, r\rn0
728            op1
729    addReg      r\rn2, r\rn3
730    RotL64      r\rn3, 512,%((_Rn_) % 8),1
731    xorReg      r\rn3, r\rn2
732            op2
733    addReg      r\rn4, r\rn5
734    RotL64      r\rn5, 512,%((_Rn_) % 8),2
735    xorReg      r\rn5, r\rn4
736            op3
737    addReg      r\rn6, r\rn7
738    RotL64      r\rn7, 512,%((_Rn_) % 8),3
739    xorReg      r\rn7, r\rn6
740            op4
741    Skein_Debug_Round 512,%(_Rn_+1),-4
742#
743.endm #R_512_OneRound
744#
745#################
746# MACRO: four rounds plus one key injection for 512-bit blocks
747#
# R_512_FourRounds _RR_
#   Four Skein-512 rounds (R_512_OneRound, with the register order changing each
#   round) followed by one key injection into r8..r15.
#   Fully unrolled case: _II_ = _RR_/4 + 1 is the injection number.  Key words for
#     r11..r15 are preloaded into rax,rbx,rcx,rdx,rsi between the MIX steps, two tweak
#     words are added into rcx and rdx, r8..r10 are added from memory, and _II_
#     itself is added into r15 through the displacement argument of addReg.
#   Looping case: rdi is the injection counter (incremented first) and indexes
#     ksKey/ksTwk.  The words at ksTwk slot rdi-1 and ksKey slot rdi-1 are copied to
#     slots rdi+2 and rdi+8, extending both schedules up the stack; rdi is then
#     added into r15 as the injection number.
#   NOTE(review): "SKEIN_ASM_UNROLL && 512" looks intended as a bit-mask test; confirm
#   how the assembler in use expands "&&" inside a macro body under .altmacro.
748.macro R_512_FourRounds _RR_    #RR = base round number (0 % 8)
749  .if (SKEIN_ASM_UNROLL && 512)
750    # here for fully unrolled case.
751    _II_ = ((_RR_)/4) + 1       #key injection counter
752    R_512_OneRound  8, 9,10,11,12,13,14,15,%((_RR_)+0),<movq ksKey+8*(((_II_)+3) % 9)+F_O(%rbp),%rax>,,<movq ksKey+8*(((_II_)+4) % 9)+F_O(%rbp),%rbx>
753    R_512_OneRound 10, 9,12,15,14,13, 8,11,%((_RR_)+1),<movq ksKey+8*(((_II_)+5) % 9)+F_O(%rbp),%rcx>,,<movq ksKey+8*(((_II_)+6) % 9)+F_O(%rbp),%rdx>
754    R_512_OneRound 12, 9,14,11, 8,13,10,15,%((_RR_)+2),<movq ksKey+8*(((_II_)+7) % 9)+F_O(%rbp),%rsi>,,<addq ksTwk+8*(((_II_)+0) % 3)+F_O(%rbp),%rcx>
755    R_512_OneRound 14, 9, 8,15,10,13,12,11,%((_RR_)+3),<addq ksTwk+8*(((_II_)+1) % 3)+F_O(%rbp),%rdx>,
756    # inject the key schedule
757    addq    ksKey+8*(((_II_)+0)%9)+F_O(%rbp),%r8
758    addReg   r11, rax
759    addq    ksKey+8*(((_II_)+1)%9)+F_O(%rbp),%r9
760    addReg   r12, rbx
761    addq    ksKey+8*(((_II_)+2)%9)+F_O(%rbp),%r10
762    addReg   r13, rcx
763    addReg   r14, rdx
764    addReg   r15, rsi,,,(_II_)
765  .else
766    # here for looping case                                                    #"rotate" key/tweak schedule (move up on stack)
767    incq    %rdi                 #bump key injection counter
768    R_512_OneRound  8, 9,10,11,12,13,14,15,%((_RR_)+0),<movq ksKey+8*6+F_O(%rbp,%rdi,8),%rdx>,<movq      ksTwk-8*1+F_O(%rbp,%rdi,8),%rax>,<movq      ksKey-8*1+F_O(%rbp,%rdi,8),%rsi>
769    R_512_OneRound 10, 9,12,15,14,13, 8,11,%((_RR_)+1),<movq ksKey+8*5+F_O(%rbp,%rdi,8),%rcx>,<movq %rax,ksTwk+8*2+F_O(%rbp,%rdi,8)     >,<movq %rsi,ksKey+8*8+F_O(%rbp,%rdi,8)>
770    R_512_OneRound 12, 9,14,11, 8,13,10,15,%((_RR_)+2),<movq ksKey+8*4+F_O(%rbp,%rdi,8),%rbx>,<addq      ksTwk+8*1+F_O(%rbp,%rdi,8),%rdx>,<movq      ksKey+8*7+F_O(%rbp,%rdi,8),%rsi>
771    R_512_OneRound 14, 9, 8,15,10,13,12,11,%((_RR_)+3),<movq ksKey+8*3+F_O(%rbp,%rdi,8),%rax>,<addq      ksTwk+8*0+F_O(%rbp,%rdi,8),%rcx>
772    # inject the key schedule
773    addq    ksKey+8*0+F_O(%rbp,%rdi,8),%r8
774    addReg   r11, rax
775    addReg   r12, rbx
776    addq    ksKey+8*1+F_O(%rbp,%rdi,8),%r9
777    addReg   r13, rcx
778    addReg   r14, rdx
779    addq    ksKey+8*2+F_O(%rbp,%rdi,8),%r10
780    addReg   r15, rsi
781    addReg   r15, rdi              #inject the round number
782  .endif
783
784    #show the result of the key injection
785    Skein_Debug_Round 512,SKEIN_RND_KEY_INJECT
786.endm #R_512_FourRounds
787#
788#################
789# instantiated code
790#
# Skein_512_Process_Block: entry and per-block setup.
#   Arguments arrive in rdi (ctx), rsi (blkPtr), rdx (blkCnt), rcx (bitcntAdd) and are
#   saved on the stack by Setup_Stack.  T[1] is loaded into rbx once, before the block loop.
#   For each block: T[0] += bitAdd (written back to the context); the three tweak words
#   (T[0], T[1], T[0]^T[1]) go to ksTwk[0..2]; the chaining values are loaded into
#   r8..r15 and stored as ksKey[0..7] with the parity word KW_PARITY ^ X[0] ^ ... ^ X[7]
#   as ksKey[8]; the input block is then added into r8..r15 (initial key injection,
#   with T[0] and T[1] added into r13 and r14) and copied to Wcopy[].
791C_label Skein_512_Process_Block
792    Setup_Stack 512,ROUNDS_512/8
793    movq    TWEAK+ 8(%rdi),%rbx
794    jmp     Skein_512_block_loop
795    .p2align 4
796    # main hash loop for Skein_512
797Skein_512_block_loop:
798    # general register usage:
799    #   RAX..RDX       = temps for key schedule pre-loads
800    #   R8 ..R15       = X0..X7
801    #   RSP, RBP       = stack/frame pointers
802    #   RDI            = round counter or context pointer
803    #   RSI            = temp
804    #
805    movq    TWEAK +  0(%rdi),%rax
806    addq    bitAdd+F_O(%rbp),%rax     #computed updated tweak value T0
807    movq    %rbx,%rcx
808    xorq    %rax,%rcx                 #%rax/%rbx/%rcx = tweak schedule
809    movq    %rax,TWEAK+ 0    (%rdi)   #save updated tweak value ctx->h.T[0]
810    movq    %rax,ksTwk+ 0+F_O(%rbp)
811    movq    $KW_PARITY,%rdx
812    movq    blkPtr +F_O(%rbp),%rsi    #%rsi --> input block
813    movq    %rbx,ksTwk+ 8+F_O(%rbp)
814    movq    %rcx,ksTwk+16+F_O(%rbp)
815    .irp _Rn_,8,9,10,11,12,13,14,15
816      movq  X_VARS+8*(_Rn_-8)(%rdi),%r\_Rn_
817      xorq  %r\_Rn_,%rdx              #compute overall parity
818      movq  %r\_Rn_,ksKey+8*(_Rn_-8)+F_O(%rbp)
819    .endr                             #load state into %r8 ..%r15, compute parity
820      movq  %rdx,ksKey+8*(8)+F_O(%rbp)#save key schedule parity
821
822    addReg   r13,rax                  #precompute key injection for tweak
823    addReg   r14, rbx
824.if _SKEIN_DEBUG
825    movq    %rbx,TWEAK+ 8(%rdi)       #save updated tweak value ctx->h.T[1] for Skein_Debug_Block below
826.endif
827    movq     0(%rsi),%rax             #load input block
828    movq     8(%rsi),%rbx
829    movq    16(%rsi),%rcx
830    movq    24(%rsi),%rdx
831    addReg   r8 , rax                 #do initial key injection
832    addReg   r9 , rbx
833    movq    %rax,Wcopy+ 0+F_O(%rbp)   #keep local copy for feedforward
834    movq    %rbx,Wcopy+ 8+F_O(%rbp)
835    addReg   r10, rcx
836    addReg   r11, rdx
837    movq    %rcx,Wcopy+16+F_O(%rbp)
838    movq    %rdx,Wcopy+24+F_O(%rbp)
839
840    movq    32(%rsi),%rax
841    movq    40(%rsi),%rbx
842    movq    48(%rsi),%rcx
843    movq    56(%rsi),%rdx
844    addReg   r12, rax
845    addReg   r13, rbx
846    addReg   r14, rcx
847    addReg   r15, rdx
848    movq    %rax,Wcopy+32+F_O(%rbp)
849    movq    %rbx,Wcopy+40+F_O(%rbp)
850    movq    %rcx,Wcopy+48+F_O(%rbp)
851    movq    %rdx,Wcopy+56+F_O(%rbp)
852
853.if _SKEIN_DEBUG
854    .irp _Rn_,8,9,10,11,12,13,14,15   #save values on stack for debug output
855      movq  %r\_Rn_,X_stk+8*(_Rn_-8)(%rsp)
856    .endr
857
858    Skein_Debug_Block 512             #debug dump
859    Skein_Debug_Round 512,SKEIN_RND_KEY_INITIAL
860.endif
861    addq    $8*WCNT,%rsi              #skip the block
862    movq    %rsi,blkPtr+F_O(%rbp)     #update block pointer
863    #
864    #################
865    # now the key schedule is computed. Start the rounds
866    #
# Skein-512 rounds.  R_512_FourRounds is instantiated _UNROLL_CNT*2 times, i.e.
# 8*_UNROLL_CNT rounds per pass.  In the looping case rdi counts key injections
# (R_512_FourRounds increments it) and the pass repeats until rdi reaches
# 2*(ROUNDS_512/8); rdi is then reloaded with the context pointer.
# An unroll count that does not divide ROUNDS_512/8 is rejected with .error, the
# directive that takes a message string (as already used for Skein-256).
867.if SKEIN_ASM_UNROLL & 512
868_UNROLL_CNT =   ROUNDS_512/8
869.else
870_UNROLL_CNT =   SKEIN_UNROLL_512
871  .if ((ROUNDS_512/8) % _UNROLL_CNT)
872    .error "Invalid SKEIN_UNROLL_512"
873  .endif
874    xorq    %rdi,%rdi                 #rdi = key injection counter
875Skein_512_round_loop:
876.endif
877#
878_Rbase_ = 0
879.rept _UNROLL_CNT*2
880      R_512_FourRounds %(4*_Rbase_+00)
881_Rbase_ = _Rbase_+1
882.endr #rept _UNROLL_CNT*2
883#
884.if (SKEIN_ASM_UNROLL & 512) == 0
885    cmpq    $2*(ROUNDS_512/8),%rdi
886    jb      Skein_512_round_loop
887    movq    ctxPtr +F_O(%rbp),%rdi           #restore rdi --> context
888.endif
# End of block: XOR the saved input words into r8..r15 and store them as the new
# chaining values.  rbx becomes T[1] masked with FIRST_MASK64 (bit 62 cleared) and is
# the T[1] value used by the next block; it is written back to the context only
# after the last block.  rdi must point at the context here: the looping code
# reloads it before this point and the fully unrolled code does not change it.
889    # end of rounds
890    #################
891    # feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..7}
892    .irp _Rn_,8,9,10,11,12,13,14,15
893  .if (_Rn_ == 8)
894    movq    $FIRST_MASK64,%rbx
895  .endif
896      xorq  Wcopy+8*(_Rn_-8)+F_O(%rbp),%r\_Rn_  #feedforward XOR
897      movq  %r\_Rn_,X_VARS+8*(_Rn_-8)(%rdi)     #and store result
898  .if (_Rn_ == 14)
899    andq    TWEAK+ 8(%rdi),%rbx
900  .endif
901    .endr
902    Skein_Debug_Round 512,SKEIN_RND_FEED_FWD
903
904    # go back for more blocks, if needed
905    decq    blkCnt+F_O(%rbp)
906    jnz     Skein_512_block_loop
907    movq    %rbx,TWEAK + 8(%rdi)                #store final T[1] in the context
908
909    Reset_Stack
910    ret
911Skein_512_Process_Block_End:
912#
# Skein_Debug_Round_512: local helper reached by "call" from the Skein_Debug_Round macro.
#   Saves rsi and rdi, copies X0..X7 (r8..r15) to X_stk[], loads rsi with the context
#   pointer and rdi with 512, then passes control to Skein_Debug_Round_Common, which
#   is defined outside the part of the file shown here.
913  .if _SKEIN_DEBUG
914# call here with rdx  = "round number"
915Skein_Debug_Round_512:
916    pushq   %rsi                     #save two regs for BLK_BITS-specific parms
917    pushq   %rdi
918  .irp _Rn_,8,9,10,11,12,13,14,15    #save X[] state on stack so debug routines can access it
919    movq    %r\_Rn_,X_stk+8*(_Rn_-8)+F_O(%rbp)
920  .endr
921    movq    ctxPtr+F_O(%rbp),%rsi    #ctx_hdr_ptr
922    movq    $512,%rdi                #now <rdi,rsi,rdx> are set for the call
923    jmp     Skein_Debug_Round_Common
924  .endif
925#
# Optional helpers, assembled only when _SKEIN_CODE_SIZE is nonzero.
#   Skein_512_Process_Block_CodeSize returns in rax the number of bytes between the
#     labels Skein_512_Process_Block and Skein_512_Process_Block_End.
#   Skein_512_Unroll_Cnt returns in rax the value of _UNROLL_CNT, or 0 when
#     _UNROLL_CNT equals ROUNDS_512/8 (the fully unrolled case).
926.if _SKEIN_CODE_SIZE
927C_label Skein_512_Process_Block_CodeSize
928    movq    $(Skein_512_Process_Block_End-Skein_512_Process_Block),%rax
929    ret
930#
931C_label Skein_512_Unroll_Cnt
932  .if _UNROLL_CNT <> (ROUNDS_512/8)
933    movq    $_UNROLL_CNT,%rax
934  .else
935    xorq    %rax,%rax
936  .endif
937    ret
938.endif
939#
940.endif # _USE_ASM_ & 512
941#
942#=================================== Skein1024 =============================================
943.if _USE_ASM_ & 1024
944#
945# void Skein1024_Process_Block(Skein_1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)#
946#
947#################
948# use details of permutation to make register assignments
949#
# Each symbol o1K_<reg> is the index in X[] of the state word that lives in
# register <reg>. The symbols are looked up by token pasting (o1K_\_rr_) in
# the .irp loops of the 1024-bit code, so the names must match the register
# names exactly.
# There is no entry with value 6: words 4 and 6 take turns in rcx, and the
# one that is not in rcx is kept in its X_stk[] slot (see the save/load
# pairs in r1024_FourRounds).
950o1K_rdi =  0        #offsets in X[] associated with each register
951o1K_rsi =  1
952o1K_rbp =  2
953o1K_rax =  3
954o1K_rcx =  4        #rcx is "shared" with X6, since X4/X6 alternate
955o1K_rbx =  5
956o1K_rdx =  7
957o1K_r8  =  8
958o1K_r9  =  9
959o1K_r10 = 10
960o1K_r11 = 11
961o1K_r12 = 12
962o1K_r13 = 13
963o1K_r14 = 14
964o1K_r15 = 15
965#
# rIdx_offs: rsp-relative offset of the stack slot that holds the key
# injection loop index when the 1024-bit rounds are not fully unrolled.
966rIdx_offs = tmpStk_1024
967#
#################
# MACRO: one Skein-1024 MIX on the word pair (X[w0],X[w1]), followed by the
#        key injection for that pair when the round is the last of a group of 4
#   w0,w1     = indices in X[] of the two words (select key/tweak schedule slots)
#   reg0,reg1 = register names, without the % prefix, holding X[w0] and X[w1]
#   _RN0_     = round number; key injection is emitted when (_RN0_ & 3) == 3
#   _Rn1_     = passed to RotL64 together with (_RN0_ % 8) to select the rotation
#   op1       = optional instruction emitted last (used for the x4/x6 spill/reload)
# addReg, RotL64 and xorReg are helper macros defined outside this section.
#
# The .if tests below use the bitwise operator. gas documents a doubled
# ampersand as logical AND, which yields only 0 or 1: with it the "== 3" test
# could never succeed, and the unroll test would not agree with the
# (SKEIN_ASM_UNROLL & 1024) tests that Skein1024_Process_Block uses to
# initialise and terminate the round loop.
#
# Looping key injection (rounds not fully unrolled): rdi is borrowed as the
# index into the key/tweak schedule, so X0 is parked in X_stk[] and receives
# its key word at the end of r1024_FourRounds instead of here.
#
968.macro r1024_Mix w0,w1,reg0,reg1,_RN0_,_Rn1_,op1
969    addReg      \reg0 , \reg1                      #perform the MIX
970    RotL64      \reg1 , 1024,%((_RN0_) % 8),_Rn1_
971    xorReg      \reg1 , \reg0
972.if ((_RN0_) & 3) == 3         #time to do key injection? (every 4th round)
973 .if _SKEIN_DEBUG
974    movq       %\reg0 , xDebug_1024+8*w0(%rsp)     #save intermediate values for Debug_Round
975    movq       %\reg1 , xDebug_1024+8*w1(%rsp)     # (before inline key injection)
976 .endif
977_II_ = ((_RN0_)/4)+1            #injection count
978 .if SKEIN_ASM_UNROLL & 1024    #here to do fully unrolled key injection
979    addq        ksKey+ 8*((_II_+w0) % 17)(%rsp),%\reg0
980    addq        ksKey+ 8*((_II_+w1) % 17)(%rsp),%\reg1
981  .if     w1 == 13                                 #tweak injection
982    addq        ksTwk+ 8*((_II_+ 0) %  3)(%rsp),%\reg1
983  .elseif w0 == 14
984    addq        ksTwk+ 8*((_II_+ 1) %  3)(%rsp),%\reg0
985  .elseif w1 == 15
986    addq        $_II_, %\reg1                      #(injection counter)
987  .endif
988 .else                          #here to do looping  key injection
989  .if  (w0 == 0)
990    movq        %rdi, X_stk+8*w0(%rsp)             #if so, store N0 so we can use reg as index
991    movq         rIdx_offs(%rsp),%rdi              #get the injection counter index into rdi
992  .else
993    addq         ksKey+8+8*w0(%rsp,%rdi,8),%\reg0  #even key injection
994  .endif
995  .if     w1 == 13                                 #tweak injection
996    addq         ksTwk+8+8* 0(%rsp,%rdi,8),%\reg1
997  .elseif w0 == 14
998    addq         ksTwk+8+8* 1(%rsp,%rdi,8),%\reg0
999  .elseif w1 == 15
1000    addReg      \reg1,rdi,,,1                      #(injection counter)
1001  .endif
1002    addq         ksKey+8+8*w1(%rsp,%rdi,8),%\reg1  #odd key injection
1003 .endif
1004.endif
1005    # insert the op provided, if any
1006    op1
1007.endm
1008#################
1009# MACRO: four rounds for 1024-bit blocks
1010#
# Emits rounds _RR_ .. _RR_+3, eight r1024_Mix invocations per round. The
# word/register pairing of each round is written out explicitly. rcx carries
# X4 and X6 in turn: the op1 argument of the third and fourth Mix of every
# round stores the word just used and loads the other one from X_stk[].
# The fourth round of the group performs the key injection inside r1024_Mix.
# When the rounds are not fully unrolled, the tail of this macro also
# extends the key/tweak schedule on the stack by one word each, bumps the
# index stored at rIdx_offs and completes the X0 injection in rdi.
#
1011.macro r1024_FourRounds _RR_    #RR = base round number (0 mod 4)
1012    # should be here with X4 set properly, X6 stored on stack
1013_Rn_ = (_RR_) + 0
1014        r1024_Mix  0, 1,rdi,rsi,_Rn_,0
1015        r1024_Mix  2, 3,rbp,rax,_Rn_,1
1016        r1024_Mix  4, 5,rcx,rbx,_Rn_,2,<movq %rcx,X_stk+8*4(%rsp)>       #save X4  on  stack (x4/x6 alternate)
1017        r1024_Mix  8, 9,r8 ,r9 ,_Rn_,4,<movq      X_stk+8*6(%rsp),%rcx>  #load X6 from stack
1018        r1024_Mix 10,11,r10,r11,_Rn_,5
1019        r1024_Mix 12,13,r12,r13,_Rn_,6
1020        r1024_Mix  6, 7,rcx,rdx,_Rn_,3
1021        r1024_Mix 14,15,r14,r15,_Rn_,7
1022    .if _SKEIN_DEBUG
1023      Skein_Debug_Round 1024,%(_Rn_+1)
1024    .endif
1025_Rn_ = (_RR_) + 1
1026        r1024_Mix  0, 9,rdi,r9 ,_Rn_,0
1027        r1024_Mix  2,13,rbp,r13,_Rn_,1
1028        r1024_Mix  6,11,rcx,r11,_Rn_,2,<movq %rcx,X_stk+8*6(%rsp)>       #save X6  on  stack (x4/x6 alternate)
1029        r1024_Mix 10, 7,r10,rdx,_Rn_,4,<movq      X_stk+8*4(%rsp),%rcx>  #load X4 from stack
1030        r1024_Mix 12, 3,r12,rax,_Rn_,5
1031        r1024_Mix 14, 5,r14,rbx,_Rn_,6
1032        r1024_Mix  4,15,rcx,r15,_Rn_,3
1033        r1024_Mix  8, 1,r8 ,rsi,_Rn_,7
1034    .if _SKEIN_DEBUG
1035      Skein_Debug_Round 1024,%(_Rn_+1)
1036    .endif
1037_Rn_ = (_RR_) + 2
1038        r1024_Mix  0, 7,rdi,rdx,_Rn_,0
1039        r1024_Mix  2, 5,rbp,rbx,_Rn_,1
1040        r1024_Mix  4, 3,rcx,rax,_Rn_,2,<movq %rcx,X_stk+8*4(%rsp)>       #save X4  on  stack (x4/x6 alternate)
1041        r1024_Mix 12,15,r12,r15,_Rn_,4,<movq      X_stk+8*6(%rsp),%rcx>  #load X6 from stack
1042        r1024_Mix 14,13,r14,r13,_Rn_,5
1043        r1024_Mix  8,11,r8 ,r11,_Rn_,6
1044        r1024_Mix  6, 1,rcx,rsi,_Rn_,3
1045        r1024_Mix 10, 9,r10,r9 ,_Rn_,7
1046    .if _SKEIN_DEBUG
1047      Skein_Debug_Round 1024,%(_Rn_+1)
1048    .endif
1049_Rn_ = (_RR_) + 3
1050        r1024_Mix  0,15,rdi,r15,_Rn_,0
1051        r1024_Mix  2,11,rbp,r11,_Rn_,1
1052        r1024_Mix  6,13,rcx,r13,_Rn_,2,<movq %rcx,X_stk+8*6(%rsp)>       #save X6  on  stack (x4/x6 alternate)
1053        r1024_Mix 14, 1,r14,rsi,_Rn_,4,<movq      X_stk+8*4(%rsp),%rcx>  #load X4 from stack
1054        r1024_Mix  8, 5,r8 ,rbx,_Rn_,5
1055        r1024_Mix 10, 3,r10,rax,_Rn_,6
1056        r1024_Mix  4, 9,rcx,r9 ,_Rn_,3
1057        r1024_Mix 12, 7,r12,rdx,_Rn_,7
1058    .if _SKEIN_DEBUG
1059      Skein_Debug_Round 1024,%(_Rn_+1)
1060    .endif
1061
1062  .if (SKEIN_ASM_UNROLL & 1024) == 0            #here with rdi == rIdx, X0 on stack
1063    #"rotate" the key schedule on the stack
    # (word [rdi] is copied to slot [rdi+17] for the key and [rdi+3] for the
    #  tweak, so later injections can index the schedule without a modulo)
1064i8 = o1K_r8
1065i0 = o1K_rdi
1066    movq    %r8 , X_stk+8*i8(%rsp)              #free up a register (save it on the stack)
1067    movq          ksKey+8* 0(%rsp,%rdi,8),%r8   #get  key  word
1068    movq    %r8 , ksKey+8*17(%rsp,%rdi,8)       #rotate key (must do key first or tweak clobbers it!)
1069    movq          ksTwk+8* 0(%rsp,%rdi,8),%r8   #get tweak word
1070    movq    %r8 , ksTwk+8* 3(%rsp,%rdi,8)       #rotate tweak (onto the stack)
1071    movq          X_stk+8*i8(%rsp)       ,%r8   #get the reg back
1072    incq    %rdi                                #bump the index
1073    movq    %rdi, rIdx_offs (%rsp)              #save rdi again
1074    movq          ksKey+8*i0(%rsp,%rdi,8),%rdi  #get the key schedule word for X0 back
1075    addq          X_stk+8*i0(%rsp)       ,%rdi  #perform the X0 key injection
1076  .endif
1077    #show the result of the key injection
1078    Skein_Debug_Round 1024,SKEIN_RND_KEY_INJECT
1079.endm #r1024_FourRounds
1080#
1081################
1082# code
1083#
#----------------------------------------------------------------
# Skein1024_Process_Block
#   void Skein1024_Process_Block(Skein_1024_Ctxt_t *ctx,const u08b_t *blkPtr,
#                                size_t blkCnt,size_t bitcntAdd)
#   rdi is used as the context pointer right after Setup_Stack. The other
#   arguments are read back from the frame slots blkPtr, blkCnt and bitAdd
#   (presumably stored there by Setup_Stack, which is defined elsewhere).
#
#   For each of the blkCnt message blocks (8*WCNT bytes each):
#     - T0 += bitAdd; tweak schedule ksTwk[0..2] = T0, T1, T0^T1
#     - key schedule ksKey[i] = ctx X[i]; ksKey[WCNT] = KW_PARITY xor all X[i]
#     - initial injection X[i] = ctx X[i] + msg[i], plus T0 into word 13 and
#       T1 into word 14; the message words are kept in Wcopy[]
#     - the rounds, emitted four at a time by r1024_FourRounds
#     - feedforward: ctx X[i] = X[i] xor Wcopy[i]
#   T1 travels in r9 from block to block, masked with FIRST_MASK64 after
#   each block; it is written back to the context after the last block
#   (and before each block when _SKEIN_DEBUG is set).
#----------------------------------------------------------------
1084C_label Skein1024_Process_Block
1085#
1086    Setup_Stack 1024,ROUNDS_1024/8,WCNT
1087    movq    TWEAK+ 8(%rdi),%r9             #r9 = T1, kept live across block iterations
1088    jmp     Skein1024_block_loop
1089    # main hash loop for Skein1024
1090    .p2align 4
1091Skein1024_block_loop:
1092    # general register usage:
1093    #   RSP              = stack pointer
1094    #   RDI,RSI,RBP,RAX  = X0..X3     (state words; rdi/rbp hold ctx/frame ptrs during setup)
1095    #   R8 ..R15         = X8..X15    (state words)
1096    #   RCX,RBX,RDX      = X4/X6 (alternating), X5, X7
1097    #
1098  .if (SKEIN_ASM_UNROLL & 1024) == 0
1099    xorq    %rax,%rax                      #init loop index on the stack
1100    movq    %rax,rIdx_offs(%rsp)
1101  .endif
1102    movq         TWEAK+     0(%rdi),%r8
1103    addq         bitAdd+  F_O(%rbp),%r8    #compute updated tweak value T0
1104    movq    %r9 ,%r10
1105    xorq    %r8 ,%r10                      #%r8/%r9/%r10 = tweak schedule (T0, T1, T0^T1)
1106    movq    %r8 ,TWEAK+     0(%rdi)        #save updated tweak value ctx->h.T[0]
1107    movq    %r8 ,ksTwk+ 0+F_O(%rbp)
1108    movq    %r9 ,ksTwk+ 8+F_O(%rbp)        #keep values in %r8 ,%r9  for initial tweak injection below
1109    movq    %r10,ksTwk+16+F_O(%rbp)
1110  .if _SKEIN_DEBUG
1111    movq    %r9 ,TWEAK+     8(%rdi)        #save updated tweak value ctx->h.T[1] for Skein_Debug_Block
1112  .endif
1113    movq         blkPtr +F_O(%rbp),%rsi    # rsi --> input block
1114    movq        $KW_PARITY        ,%rax    #overall key schedule parity
1115
1116    # the logic here assumes the set {rdi,rsi,rbp,rax} = X[0,1,2,3]
    # (those four registers are busy as ctx ptr, msg ptr, frame ptr and parity
    #  accumulator, and rcx is the temp of the next loop, so words 0..4 and 6
    #  are built in r14/r15 and parked in X_stk[])
1117    .irp _rN_,0,1,2,3,4,6                  #process the "initial" words, using r14/r15 as temps
1118      movq       X_VARS+8*_rN_(%rdi),%r14  #get state word
1119      movq              8*_rN_(%rsi),%r15  #get msg   word
1120      xorq  %r14,%rax                      #update key schedule overall parity
1121      movq  %r14,ksKey +8*_rN_+F_O(%rbp)   #save key schedule word on stack
1122      movq  %r15,Wcopy +8*_rN_+F_O(%rbp)   #save local msg Wcopy
1123      addq  %r15,%r14                      #do the initial key injection
1124      movq  %r14,X_stk +8*_rN_    (%rsp)   #save initial state var on stack
1125    .endr
1126    # now process the rest, using the "real" registers
1127    #     (MUST do it in reverse order to inject tweaks r8/r9 first)
    #     (r8/r9 still hold T0/T1 here; words 13 and 14 consume them before
    #      r9 and r8 themselves are overwritten with state words)
1128    .irp _rr_,r15,r14,r13,r12,r11,r10,r9,r8,rdx,rbx
1129_oo_ = o1K_\_rr_                           #offset associated with the register
1130      movq  X_VARS+8*_oo_(%rdi),%\_rr_     #get key schedule word from context
1131      movq         8*_oo_(%rsi),%rcx       #get next input msg word
1132      movq  %\_rr_, ksKey +8*_oo_(%rsp)    #save key schedule on stack
1133      xorq  %\_rr_, %rax                   #accumulate key schedule parity
1134      movq  %rcx,Wcopy+8*_oo_+F_O(%rbp)    #save copy of msg word for feedforward
1135      addq  %rcx,%\_rr_                    #do the initial  key  injection
1136      .if    _oo_ == 13                    #do the initial tweak injection
1137        addReg _rr_,r8                     #          (only in words 13/14)
1138      .elseif _oo_ == 14
1139        addReg _rr_,r9
1140      .endif
1141    .endr
1142    movq    %rax,ksKey+8*WCNT+F_O(%rbp)    #save key schedule parity
1143.if _SKEIN_DEBUG
1144    Skein_Debug_Block 1024                 #initial debug dump
1145.endif
1146    addq     $8*WCNT,%rsi                  #bump the msg ptr
1147    movq     %rsi,blkPtr+F_O(%rbp)         #save bumped msg ptr
1148    # re-load words 0..4 from stack, enter the main loop
    # (from here on rdi and rbp hold state words, so every access uses rsp)
1149    .irp _rr_,rdi,rsi,rbp,rax,rcx          #(no need to re-load x6, already on stack)
1150      movq  X_stk+8*o1K_\_rr_(%rsp),%\_rr_ #re-load state and get ready to go!
1151    .endr
1152.if _SKEIN_DEBUG
1153    Skein_Debug_Round 1024,SKEIN_RND_KEY_INITIAL        #show state after initial key injection
1154.endif
1155    #
1156    #################
1157    # now the key schedule is computed. Start the rounds
1158    #
    # _UNROLL_CNT is the number of 8-round groups emitted inline: all of them
    # when fully unrolled, otherwise SKEIN_UNROLL_1024 per pass of the loop
    # (which must divide ROUNDS_1024/8).
1159.if SKEIN_ASM_UNROLL & 1024
1160_UNROLL_CNT =   ROUNDS_1024/8
1161.else
1162_UNROLL_CNT =   SKEIN_UNROLL_1024
1163  .if ((ROUNDS_1024/8) % _UNROLL_CNT)
1164    .error "Invalid SKEIN_UNROLL_1024"
1165  .endif
1166Skein1024_round_loop:
1167.endif
1168#
1169_Rbase_ = 0
1170.rept _UNROLL_CNT*2                        #implement the rounds, 4 at a time
1171      r1024_FourRounds %(4*_Rbase_+00)
1172_Rbase_ = _Rbase_+1
1173.endr #rept _UNROLL_CNT
1174#
1175.if (SKEIN_ASM_UNROLL & 1024) == 0
1176    cmpq    $2*(ROUNDS_1024/8),tmpStk_1024(%rsp) #see if we are done (slot is the rIdx_offs loop index)
1177    jb      Skein1024_round_loop
1178.endif
1179    # end of rounds
1180    #################
1181    #
1182    # feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..15}
1183    movq    %rdx,X_stk+8*o1K_rdx(%rsp) #we need a register. x6 already on stack
1184    movq       ctxPtr(%rsp),%rdx
1185
    # once r9 (word 9) has been stored it is free: it is reloaded with
    # FIRST_MASK64 and, at word 14, ANDed with ctx T[1], giving the T1 value
    # for the next block (or for the final store below)
1186    .irp _rr_,rdi,rsi,rbp,rax,rcx,rbx,r8,r9,r10,r11,r12,r13,r14,r15   #do all but x6,x7
1187_oo_ = o1K_\_rr_
1188      xorq  Wcopy +8*_oo_(%rsp),%\_rr_ #feedforward XOR
1189      movq  %\_rr_,X_VARS+8*_oo_(%rdx) #save result into context
1190      .if (_oo_ ==  9)
1191        movq   $FIRST_MASK64 ,%r9
1192      .endif
1193      .if (_oo_ == 14)
1194        andq   TWEAK+ 8(%rdx),%r9
1195      .endif
1196    .endr
1197    #
1198    movq         X_stk +8*6(%rsp),%rax #now process x6,x7 (skipped in .irp above)
1199    movq         X_stk +8*7(%rsp),%rbx
1200    xorq         Wcopy +8*6(%rsp),%rax
1201    xorq         Wcopy +8*7(%rsp),%rbx
1202    movq    %rax,X_VARS+8*6(%rdx)
1203    decq             blkCnt(%rsp)      #set zero flag iff done
1204    movq    %rbx,X_VARS+8*7(%rdx)
1205
    # the debug macro is handed a cmpq so the zero flag can be re-established
    # after the debug call; only movq/lea (flag-neutral) follow until the jnz
1206    Skein_Debug_Round 1024,SKEIN_RND_FEED_FWD,,<cmpq $0,blkCnt(%rsp)>
1207    # go back for more blocks, if needed
1208    movq             ctxPtr(%rsp),%rdi #don't muck with the flags here!
1209    lea          FRAME_OFFS(%rsp),%rbp #restore the frame pointer (rbp was used as X2)
1210    jnz     Skein1024_block_loop
1211    movq    %r9 ,TWEAK+   8(%rdx)      #last block done: write T1 back to the context
1212    Reset_Stack
1213    ret
1214#
1215Skein1024_Process_Block_End:
1216#
#----------------------------------------------------------------
# Skein_Debug_Round_1024 (assembled only when _SKEIN_DEBUG is non-zero)
# Debug trampoline for the 1024-bit code. It copies the live state
# registers into their X_stk[] slots, sets rdi = 1024 and rsi = ctx
# pointer, and jumps to Skein_Debug_Round_Common.
#   in      : rdx = "round number"
#   on stack: return address, then the caller's original rdx (pushed by
#             the invoking macro, per the _SP_OFFS_ note below)
# rsp-relative offsets are biased by _SP_OFFS_, an assemble-time count of
# the bytes sitting on the stack above the normal frame.
#----------------------------------------------------------------
1217.if _SKEIN_DEBUG
1218Skein_Debug_Round_1024:
1219    # call here with rdx  = "round number",
1220_SP_OFFS_ = 8*2                     #stack "offset" here: rdx, return addr
1221    #
1222  #save rest of X[] state on stack so debug routines can access it
  #(rdi, rcx and rdx are handled separately below)
1223  .irp _rr_,rsi,rbp,rax,rbx,r8,r9,r10,r11,r12,r13,r14,r15
1224    movq    %\_rr_,X_stk+8*o1K_\_rr_+_SP_OFFS_(%rsp)
1225  .endr
1226    # Figure out what to do with x0 (rdi).  When rdx == 0 mod 4, it's already on stack
1227    cmpq    $SKEIN_RND_SPECIAL,%rdx #special rounds always save
1228    jae     save_x0
1229    testq   $3,%rdx                 #otherwise only if rdx != 0 mod 4
1230    jz      save_x0_not
1231save_x0:
1232    movq    %rdi,X_stk+8*o1K_rdi+_SP_OFFS_(%rsp)
1233save_x0_not:
1234    #figure out the x4/x6 swapping state and save the correct one!
1235    cmpq    $SKEIN_RND_SPECIAL,%rdx #special rounds always do x4
1236    jae     save_x4
1237    testq   $1,%rdx                  #and even ones have x4 as well
1238    jz      save_x4
1239    movq    %rcx,X_stk+8*6+_SP_OFFS_(%rsp)   #odd round number: rcx currently holds x6
1240    jmp     debug_1024_go
1241save_x4:
1242    movq    %rcx,X_stk+8*4+_SP_OFFS_(%rsp)
1243debug_1024_go:
1244    #now all is saved in Xstk[] except for rdx
1245    push    %rsi                    #save two regs for BLK_BITS-specific parms
1246    push    %rdi
1247_SP_OFFS_ = _SP_OFFS_ + 16          #adjust stack offset accordingly (now 32)
1248
1249    movq    _SP_OFFS_-8(%rsp),%rsi  #get back original %rdx (pushed on stack in macro call)
1250    movq    %rsi,X_stk+8*o1K_rdx+_SP_OFFS_(%rsp) #and save it in its rightful place in X_stk[]
1251
1252    movq    ctxPtr+_SP_OFFS_(%rsp),%rsi  #rsi = ctx_hdr_ptr
1253    movq    $1024,%rdi                   #rdi = block size
1254    jmp     Skein_Debug_Round_Common     #jump, not call: Common returns straight to our caller
1255.endif
1256#
#----------------------------------------------------------------
# Code-size / unroll-count query functions for the 1024-bit code
# (assembled only when _SKEIN_CODE_SIZE is non-zero). Both return their
# result in rax and take no arguments.
#   Skein1024_Process_Block_CodeSize: byte distance between the labels
#       Skein1024_Process_Block and Skein1024_Process_Block_End
#   Skein1024_Unroll_Cnt: _UNROLL_CNT as set before the 1024-bit round
#       code, or 0 when it equals ROUNDS_1024/8 (the fully unrolled case)
#----------------------------------------------------------------
1257.if _SKEIN_CODE_SIZE
1258C_label Skein1024_Process_Block_CodeSize
1259    movq    $(Skein1024_Process_Block_End-Skein1024_Process_Block),%rax   #assemble-time constant
1260    ret
1261#
1262C_label Skein1024_Unroll_Cnt
1263  .if _UNROLL_CNT <> (ROUNDS_1024/8)
1264    movq    $_UNROLL_CNT,%rax        #partially unrolled: report the count
1265  .else
1266    xorq    %rax,%rax                #fully unrolled: report 0
1267  .endif
1268    ret
1269.endif
1270#
1271.endif # _USE_ASM_ and 1024
1272#
1273.if _SKEIN_DEBUG
1274#----------------------------------------------------------------
1275#local debug routine to set up for calls to:
1276#  void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,int r,const u64b_t *X)
1277#                       [       rdi                        rsi   rdx              rcx]
1278#
1279# here with %rdx = round number
1280#           %rsi = ctx_hdr_ptr
1281#           %rdi = block size (256/512/1024)
1282# on stack: saved rdi, saved rsi, retAddr, saved rdx
1283#
# Reached by jmp from the per-size trampolines. rax, rbx, rcx, rbp and
# r8..r15 are saved around the external call; on exit the saved rdi and rsi
# are popped and the ret consumes retAddr, leaving the saved rdx on the
# stack for the code that pushed it.
# The X[] pointer passed in rcx is chosen as follows:
#   - X_stk[] by default
#   - the context X_VARS[] when rdx == SKEIN_RND_FEED_FWD
#   - xDebug_1024[] when rdi == 1024 and rdx is a non-zero multiple of 4
#     below SKEIN_RND_SPECIAL (state captured before the key injection)
# _SP_OFFS_ is an assemble-time count of the bytes pushed above the frame.
#
1284Skein_Debug_Round_Common:
1285_SP_OFFS_ = 32                        #account for four words on stack already
1286  .irp _rr_,rax,rbx,rcx,rbp,r8,r9,r10,r11,r12,r13,r14,r15  #save the rest of the regs
1287    pushq %\_rr_
1288_SP_OFFS_ = _SP_OFFS_+8
1289  .endr
  # NOTE(review): this only checks that the pushed byte count is a multiple
  # of 16; actual rsp alignment also depends on the frame set up by the caller
1290  .if (_SP_OFFS_ % 16)                # make sure stack is still 16-byte aligned here
1291    .error  "Debug_Round_Common: stack alignment"
1292  .endif
1293    # compute %rcx  = ptr to the X[] array on the stack (final parameter to call)
1294    leaq    X_stk+_SP_OFFS_(%rsp),%rcx #adjust for reg pushes, return address
1295    cmpq    $SKEIN_RND_FEED_FWD,%rdx   #special handling for feedforward "round"?
1296    jnz     _got_rcxA
1297    leaq    X_VARS(%rsi),%rcx          #feedforward: show the state stored in the context
1298_got_rcxA:
1299  .if _USE_ASM_ & 1024
1300    # special handling for 1024-bit case
1301    #    (for rounds right before with key injection:
1302    #        use xDebug_1024[] instead of X_stk[])
1303    cmpq    $SKEIN_RND_SPECIAL,%rdx
1304    jae     _got_rcxB               #special "round" code: keep the pointer chosen above
1305    orq     %rdx,%rdx
1306    jz      _got_rcxB               #round 0: keep X_stk[]
1307    test    $3,%rdx
1308    jne     _got_rcxB               #not a multiple of 4: not just before a key injection
1309    cmp     $1024,%rdi              #only 1024-bit(s) for now
1310    jne     _got_rcxB
1311    leaq    xDebug_1024+_SP_OFFS_(%rsp),%rcx
1312_got_rcxB:
1313  .endif
1314    call    Skein_Show_Round        #call external debug handler
1315
1316  .irp _rr_,r15,r14,r13,r12,r11,r10,r9,r8,rbp,rcx,rbx,rax  #restore regs
1317    popq  %\_rr_
1318_SP_OFFS_ = _SP_OFFS_-8
1319  .endr
1320  .if _SP_OFFS_ - 32                  #pushes and pops above must balance
1321    .error   "Debug_Round_Common: push/pop misalignment!"
1322  .endif
1323    popq    %rdi                    #pushed by the per-size trampoline
1324    popq    %rsi
1325    ret
1326.endif
1327#----------------------------------------------------------------
1328    .section .note.GNU-stack,"",@progbits
1329
1330    .end
1331