#
#----------------------------------------------------------------
# Skein block functions: 32-bit x86 assembler code using XMM registers
#
# Author: Doug Whiting, Hifn/Exar
#
# This code is released to the public domain.
#----------------------------------------------------------------
#
# Syntax: GNU as, AT&T operand order, .altmacro enabled.
#
# Optional build-time symbols examined below:
#   SKEIN_USE_ASM    mask of 256/512/1024 bits; used only if it has at least
#                    one of those bits set, otherwise all three are enabled
#   SKEIN_LOOP       decimal digits: hundreds = Skein-256 unroll count,
#                    tens = Skein-512, ones = Skein-1024 (0 = fully unrolled)
#   SKEIN_ROUNDS     decimal digits in the same 256/512/1024 positions; a
#                    digit d becomes 8*(((d+5) % 10)+5) rounds
#                    (so d=9 gives 72 rounds and d=0 gives 80 rounds)
#   SKEIN_CODE_SIZE  either of these defines _SKEIN_CODE_SIZE
#   SKEIN_PERF
#   SKEIN_DEBUG      sets _SKEIN_DEBUG to 1 (otherwise 0)
#
        .text
        .altmacro                           #use advanced macro features
        .psize      0,128                   #list file has no page boundaries
#
_MASK_ALL_          = (256+512+1024)        #all three algorithm bits
SAVE_REGS           = 1
#
#################
# which block sizes get assembly code: start from "all", then accept the
# caller's mask only when it names at least one valid block size
#
_USE_ASM_           = _MASK_ALL_
.ifdef SKEIN_USE_ASM
  .if SKEIN_USE_ASM & _MASK_ALL_
_USE_ASM_           = SKEIN_USE_ASM
  .endif
.endif
#
#################
# loop unrolling: the default value 002 leaves Skein-256 and Skein-512 fully
# unrolled and gives Skein-1024 an unroll count of 2
#
_SKEIN_LOOP         = 002
.ifdef SKEIN_LOOP
_SKEIN_LOOP         = SKEIN_LOOP
.endif
#--------------
# per-size unroll counts, one decimal digit each (0 --> fully unrolled)
SKEIN_UNROLL_256    = (_SKEIN_LOOP / 100) % 10
SKEIN_UNROLL_512    = (_SKEIN_LOOP /  10) % 10
SKEIN_UNROLL_1024   = (_SKEIN_LOOP      ) % 10
#
# SKEIN_ASM_UNROLL collects the block-size bits whose unroll count is zero
SKEIN_ASM_UNROLL    = 0
.if (SKEIN_UNROLL_256) == 0
SKEIN_ASM_UNROLL    = SKEIN_ASM_UNROLL + 256
.endif
.if (SKEIN_UNROLL_512) == 0
SKEIN_ASM_UNROLL    = SKEIN_ASM_UNROLL + 512
.endif
.if (SKEIN_UNROLL_1024) == 0
SKEIN_ASM_UNROLL    = SKEIN_ASM_UNROLL + 1024
.endif
#
#################
# round counts: decoded from SKEIN_ROUNDS when given, else 72/72/80
#
.ifdef SKEIN_ROUNDS
ROUNDS_256          = 8*((((SKEIN_ROUNDS / 100) + 5) % 10) + 5)
ROUNDS_512          = 8*((((SKEIN_ROUNDS /  10) + 5) % 10) + 5)
ROUNDS_1024         = 8*((((SKEIN_ROUNDS      ) + 5) % 10) + 5)
  # report the selected round counts while assembling
  # NOTE(review): both .print branches below emit the same text as written here;
  # presumably they once differed only in column padding -- confirm before merging them
  .irp _NN_,256,512,1024
    .if _USE_ASM_ && \_NN_
      .irp _RR_,%(ROUNDS_\_NN_)
        .if \_NN_ < 1024
.print "+++ SKEIN_ROUNDS_\_NN_ = \_RR_"
        .else
.print "+++ SKEIN_ROUNDS_\_NN_ = \_RR_"
        .endif
      .endr
    .endif
  .endr
.else
ROUNDS_256          = 72
ROUNDS_512          = 72
ROUNDS_1024         = 80
.endif
#################
# _SKEIN_CODE_SIZE is defined when SKEIN_CODE_SIZE or SKEIN_PERF is defined
#
.ifdef SKEIN_CODE_SIZE
_SKEIN_CODE_SIZE    = (1)
.endif
.ifdef SKEIN_PERF
_SKEIN_CODE_SIZE    = (1)
.endif
#
#################
# _SKEIN_DEBUG is always defined: 1 when SKEIN_DEBUG is defined, else 0
#
.ifdef SKEIN_DEBUG
_SKEIN_DEBUG        = 1
.else
_SKEIN_DEBUG        = 0
.endif
#################
#
# define offsets of fields in hash context structure
#
HASH_BITS           =  0                    ## bits of hash output
BCNT                =  4 + HASH_BITS        #number of bytes in BUFFER[]
TWEAK               =  4 + BCNT             #tweak values[0..1]
X_VARS              = 16 + TWEAK            #chaining vars
#
#(Note: buffer[] in context structure is NOT needed here :-)
#
KW_PARITY_LO        = 0xA9FC1A22            #overall parity of key schedule words (hi32/lo32)
KW_PARITY_HI        = 0x1BD11BDA
FIRST_MASK8         = ~ (1 << 6)            #FIRST block flag bit
#
# rotation constants for Skein
#   naming: RC_<block bits>_<round mod 8>_<mix index within the round>
#
#----- Skein-256
RC_256_0_0          = 14
RC_256_0_1          = 16
#
RC_256_1_0          = 52
RC_256_1_1          = 57
#
RC_256_2_0          = 23
RC_256_2_1          = 40
#
RC_256_3_0          =  5
RC_256_3_1          = 37
#
RC_256_4_0          = 25
RC_256_4_1          = 33
#
RC_256_5_0          = 46
RC_256_5_1          = 12
#
RC_256_6_0          = 58
RC_256_6_1          = 22
#
RC_256_7_0          = 32
RC_256_7_1          = 32
#
#----- Skein-512
RC_512_0_0          = 46
RC_512_0_1          = 36
RC_512_0_2          = 19
RC_512_0_3          = 37
#
RC_512_1_0          = 33
RC_512_1_1          = 27
RC_512_1_2          = 14
RC_512_1_3          = 42
#
RC_512_2_0          = 17
RC_512_2_1          = 49
RC_512_2_2          = 36
RC_512_2_3          = 39
#
RC_512_3_0          = 44
RC_512_3_1          =  9
RC_512_3_2          = 54
RC_512_3_3          = 56
#
RC_512_4_0          = 39
RC_512_4_1          = 30
RC_512_4_2          = 34
RC_512_4_3          = 24
#
RC_512_5_0          = 13
RC_512_5_1          = 50
RC_512_5_2          = 10
RC_512_5_3          = 17
#
RC_512_6_0          = 25
RC_512_6_1          = 29
RC_512_6_2          = 39
RC_512_6_3          = 43
#
RC_512_7_0          =  8
RC_512_7_1          = 35
RC_512_7_2          = 56
RC_512_7_3          = 22
#
#----- Skein-1024
RC_1024_0_0         = 24
RC_1024_0_1         = 13
RC_1024_0_2         =  8
RC_1024_0_3         = 47
RC_1024_0_4         =  8
RC_1024_0_5         = 17
RC_1024_0_6         = 22
RC_1024_0_7         = 37
#
RC_1024_1_0         = 38
RC_1024_1_1         = 19
RC_1024_1_2         = 10
RC_1024_1_3         = 55
RC_1024_1_4         = 49
RC_1024_1_5         = 18
RC_1024_1_6         = 23
RC_1024_1_7         = 52
#
RC_1024_2_0         = 33
RC_1024_2_1         =  4
RC_1024_2_2         = 51
RC_1024_2_3         = 13
RC_1024_2_4         = 34
RC_1024_2_5         = 41
RC_1024_2_6         = 59
RC_1024_2_7         = 17
#
RC_1024_3_0         =  5
RC_1024_3_1         = 20
RC_1024_3_2         = 48
RC_1024_3_3         = 41
RC_1024_3_4         = 47
RC_1024_3_5         = 28
RC_1024_3_6         = 16
RC_1024_3_7         = 25
#
RC_1024_4_0         = 41
RC_1024_4_1         =  9
RC_1024_4_2         = 37
RC_1024_4_3         = 31
RC_1024_4_4         = 12
RC_1024_4_5         = 47
RC_1024_4_6         = 44
RC_1024_4_7         = 30
#
RC_1024_5_0         = 16
RC_1024_5_1         = 34
RC_1024_5_2         = 56
RC_1024_5_3         = 51
RC_1024_5_4         =  4
RC_1024_5_5         = 53
RC_1024_5_6         = 42
RC_1024_5_7         = 41
#
RC_1024_6_0         = 31
RC_1024_6_1         = 44
RC_1024_6_2         = 47
RC_1024_6_3         = 46
RC_1024_6_4         = 19
RC_1024_6_5         = 42
RC_1024_6_6         = 44
RC_1024_6_7         = 25
#
RC_1024_7_0         =  9
RC_1024_7_1         = 48
RC_1024_7_2         = 35
RC_1024_7_3         = 52
RC_1024_7_4         = 23
RC_1024_7_5         = 31
RC_1024_7_6         = 37
RC_1024_7_7         = 20
#
#----------------------------------------------------------------
# NOTE(review): inside macro and .irp bodies this file writes "&&" where the
# surrounding arithmetic needs a bitwise AND (for example "(\_RR_) && 7" has to
# pick one of the eight RC_* rows) and also to glue text onto a substituted
# parameter ("RC_256_\_qq_&&_0"). That relies on how the assembler's macro
# processor treats "&" in .altmacro mode -- confirm against the binutils
# version in use before touching these expressions.
#
#----------------------------------------------------------------
# MACRO: declare allocated space on the stack
#   localName   symbol to define; it receives the current value of _STK_OFFS_
#   localSize   byte count by which _STK_OFFS_ is then advanced
#
.macro StackVar localName,localSize
\localName  = _STK_OFFS_
_STK_OFFS_  = _STK_OFFS_+(\localSize)
.endm #StackVar
#
#----------------------------------------------------------------
#
# MACRO: Configure stack frame, allocate local vars
#   WCNT        number of 64-bit words in the block
#   RND_CNT     round count, used only to size ksRot
#
# Symbols defined: X_stk, Wcopy, ksTwk, ksKey (offsets from the aligned esp),
#   ksRot (only when this block size is not fully unrolled), FRAME_OFFS/F_O,
#   LOCAL_SIZE, and savRegs/retAddr/ctxPtr/blkPtr/blkCnt/bitAdd (offsets from ebx).
# Registers on exit:
#   ebx = esp as it was right after pushal (base for the caller parameters)
#   esp = 16-byte aligned base of the local variables
#   ebp = esp + FRAME_OFFS
#   edi = ctxPtr argument, ecx = blkCnt argument
#
.macro Setup_Stack WCNT,RND_CNT
_STK_OFFS_  = 0                     #starting offset from esp, forced on 16-byte alignment
    #----- local variables          #<-- esp
    StackVar X_stk  , 8*(WCNT)      #local context vars
    StackVar Wcopy  , 8*(WCNT)      #copy of input block
    StackVar ksTwk  ,16*3           #key schedule: tweak words
    StackVar ksKey  ,16*(WCNT)+16   #key schedule: key words
FRAME_OFFS  = ksTwk+128             #<-- ebp
F_O         = FRAME_OFFS            #syntactic shorthand
  .if (SKEIN_ASM_UNROLL && (WCNT*64)) == 0
    StackVar ksRot,16*(RND_CNT/4)   #leave space for ks "rotation" to happen
  .endif
LOCAL_SIZE  = _STK_OFFS_            #size of local vars
    #
    #"restart" the stack defns, because we relocate esp to guarantee alignment
    #   (i.e., these vars are NOT at fixed offsets from esp)
_STK_OFFS_  = 0
    #-----
    StackVar savRegs,8*4            #pushad data
    StackVar retAddr,4              #return address
    #----- caller parameters
    StackVar ctxPtr ,4              #context ptr
    StackVar blkPtr ,4              #pointer to block data
    StackVar blkCnt ,4              #number of full blocks to process
    StackVar bitAdd ,4              #bit count to add to tweak
    #----- caller's stack frame
#
# Notes on stack frame setup:
#   * the most used variable (except for Skein-256) is X_stk[], based at [esp+0]
#   * the next most used is the key schedule words
#       so ebp is "centered" there, allowing short offsets to the key/tweak
#       schedule in 256/512-bit Skein cases, but not possible for Skein-1024 :-(
#   * the Wcopy variables are infrequently accessed, and they have long
#       offsets from both esp and ebp only in the 1024-bit case.
#   * all other local vars and calling parameters can be accessed
#       with short offsets, except in the 1024-bit case
#
    pushal                          #save all regs
    movl    %esp,%ebx               #keep ebx as pointer to caller parms
    subl    $LOCAL_SIZE,%esp        #make room for the locals
    andl    $~15,%esp               #force alignment
    movl    ctxPtr(%ebx),%edi       #edi --> Skein context
    leal    FRAME_OFFS(%esp),%ebp   #maximize use of short offsets from ebp
    movl    blkCnt(%ebx),%ecx       #keep block cnt in ecx
.endm #Setup_Stack
#
#----------------------------------------------------------------
# MACRO: undo Setup_Stack (procStart is accepted but not used in the body)
#
.macro Reset_Stack,procStart
    movl    %ebx,%esp               #get rid of locals (wipe??)
    popal                           #restore all regs
.endm # Reset_Stack
#
#----------------------------------------------------------------
# macros to help debug internals
#
.if _SKEIN_DEBUG
    .extern _Skein_Show_Block       #calls to C routines
    .extern _Skein_Show_Round
#
SKEIN_RND_SPECIAL       = 1000
SKEIN_RND_KEY_INITIAL   = SKEIN_RND_SPECIAL+0
SKEIN_RND_KEY_INJECT    = SKEIN_RND_SPECIAL+1
SKEIN_RND_FEED_FWD      = SKEIN_RND_SPECIAL+2
#
# MACRO: dump the block-level state through the C routine _Skein_Show_Block
#   BLK_BITS    block size in bits; also selects _Put_XMM_<bits>/_Get_XMM_<bits>
# All general registers are preserved (pushal/popal around the call).
#
.macro Skein_Debug_Block BLK_BITS
#
#void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X,
#       const u08b_t *blkPtr, const u64b_t *wPtr,
#       const u64b_t *ksPtr,const u64b_t *tsPtr)#
#
    call    _Put_XMM_\BLK_BITS
    pushal                          #save all regs
    leal    ksTwk+1-F_O(%ebp),%eax  #+1 = flag: "stride" size = 2 qwords
    leal    ksKey+1-F_O(%ebp),%esi
    leal    Wcopy+32(%esp),%ecx     #adjust offset by 32 for pushad
    movl    ctxPtr(%ebx)  ,%edx     #ctx_hdr_ptr
    leal    X_VARS(%edx)  ,%edx     #edx ==> cxt->X[]
    pushl   %eax                    #tsPtr
    pushl   %esi                    #ksPtr
    pushl   %ecx                    #wPtr
    pushl   blkPtr(%ebx)            #blkPtr
    pushl   %edx                    #ctx->Xptr
    pushl   ctxPtr(%ebx)            #ctx_hdr_ptr
    movl    $\BLK_BITS,%eax
    pushl   %eax                    #bits
    call    _Skein_Show_Block
    addl    $7*4,%esp               #discard parameter space on stack
    popal                           #restore regs
#
    call    _Get_XMM_\BLK_BITS
.endm #Skein_Debug_Block
#
# MACRO: dump the state after a round through the C routine _Skein_Show_Round
#   BLK_BITS    block size in bits
#   R           round number, or one of the SKEIN_RND_* special codes
#   saveRegs    nonzero: spill the XMM state to X_stk before the call and
#               reload it afterwards
# X pointer passed: ctx->X[] when R is SKEIN_RND_FEED_FWD, otherwise X_stk.
# Round number passed: R itself when this block size is fully unrolled or R is a
# special code; otherwise it is formed from the loop counter in edx.
#
.macro Skein_Debug_Round BLK_BITS,R,saveRegs=0
#
#void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,int r,const u64b_t *X)#
#
  .if \saveRegs
    call    _Put_XMM_\BLK_BITS
  .endif
    pushal                          #save all regs
  .if R <> SKEIN_RND_FEED_FWD
    leal    32+X_stk(%esp),%eax     #adjust offset by 32 for pushal
  .else
    movl    ctxPtr(%ebx),%eax
    addl    $X_VARS,%eax
  .endif
    pushl   %eax                    #Xptr
  .if (SKEIN_ASM_UNROLL && \BLK_BITS) || (\R >= SKEIN_RND_SPECIAL)
    movl    $\R,%eax
  .else                             #compute round number from edx, R
    leal    1+(((\R)-1) && 3)(,%edx,4),%eax
  .endif
    pushl   %eax                    #round number
    pushl   ctxPtr(%ebx)            #ctx_hdr_ptr
    movl    $\BLK_BITS,%eax
    pushl   %eax                    #bits
    call    _Skein_Show_Round
    addl    $4*4,%esp               #discard parameter space on stack
    popal                           #restore regs
  .if \saveRegs
    call    _Get_XMM_\BLK_BITS      #reload the XMM state that was spilled to X_stk above
  .endif
.endm #Skein_Debug_Round
.endif #ifdef SKEIN_DEBUG
#
#----------------------------------------------------------------
# useful macros: move word xn between %xmm<xn> and its X_stk slot
.macro _ldX xn
    movq    X_stk+8*(\xn)(%esp),%xmm\xn
.endm
.macro _stX xn
    movq    %xmm\xn,X_stk+8*(\xn)(%esp)
.endm
#
#----------------------------------------------------------------
# MACRO: define a global entry point under both lName and _lName
#
.macro C_label lName
\lName:                             #use both "genders" to work across linkage conventions
_\lName:
    .global \lName
    .global _\lName
.endm
#
.if _USE_ASM_ & 256
#
# void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)#
#
#################
#
# Skein-256 round macros
#
# MACRO: one round = two mix steps, (x0 += x1, rotate x1) and (x2 += x3, rotate x3)
#   _RR_        round number; only its value modulo 8 selects the RC_256 row
#   x0..x3      XMM register numbers holding the state words
#   t0,t1       XMM register numbers used as scratch
# The two rotation constants of the row are swapped when x0 is not 0.
#
.macro R_256_OneRound _RR_,x0,x1,x2,x3,t0,t1
  .irp _qq_,%((\_RR_) && 7)         #figure out which rotation constants to use
    .if \x0 == 0
_RC0_ = RC_256_\_qq_&&_0
_RC1_ = RC_256_\_qq_&&_1
    .else
_RC0_ = RC_256_\_qq_&&_1
_RC1_ = RC_256_\_qq_&&_0
    .endif
  .endr
#
    paddq   %xmm\x1,%xmm\x0
    movq    %xmm\x1,%xmm\t0
    psllq   $   _RC0_,%xmm\x1
    psrlq   $64-_RC0_,%xmm\t0
    xorpd   %xmm\x0,%xmm\x1
    xorpd   %xmm\t0,%xmm\x1
#
    paddq   %xmm\x3,%xmm\x2
    movq    %xmm\x3,%xmm\t1
    psllq   $   _RC1_,%xmm\x3
    psrlq   $64-_RC1_,%xmm\t1
    xorpd   %xmm\x2,%xmm\x3
    xorpd   %xmm\t1,%xmm\x3
  .if _SKEIN_DEBUG
    Skein_Debug_Round 256,%(\_RR_+1),SAVE_REGS
  .endif
.endm #R_256_OneRound
#
# MACRO: four rounds followed by one key injection
#   _RN_        number of the first of the four rounds
# Uses xmm0..xmm3 as the state, xmm4/xmm5 as scratch, bumps edx, and in the
# looping build also advances esi by one 16-byte key schedule slot.
#
.macro R_256_FourRounds _RN_
    R_256_OneRound %(_RN_+0),0,1,2,3,4,5
    R_256_OneRound  (_RN_+1),2,1,0,3,4,5
    R_256_OneRound  (_RN_+2),0,1,2,3,4,5
    R_256_OneRound  (_RN_+3),2,1,0,3,4,5
    #inject key schedule
    incl    %edx                    #bump round number
    movd    %edx,%xmm4
  .if _UNROLL_CNT == (ROUNDS_256/8)
    #fully unrolled version
_RK_ = ((_RN_)/4)                   #key injection counter
    paddq   ksKey+16*((_RK_+1) % 5)-F_O(%ebp),%xmm0
    paddq   ksKey+16*((_RK_+2) % 5)-F_O(%ebp),%xmm1
    paddq   ksKey+16*((_RK_+3) % 5)-F_O(%ebp),%xmm2
    paddq   ksKey+16*((_RK_+4) % 5)-F_O(%ebp),%xmm3
    paddq   ksTwk+16*((_RK_+1) % 3)-F_O(%ebp),%xmm1
    paddq   ksTwk+16*((_RK_+2) % 3)-F_O(%ebp),%xmm2
    paddq   %xmm4,%xmm3
  .else                             #looping version
    paddq   ksKey+16*1-F_O(%esi),%xmm0
    paddq   ksKey+16*2-F_O(%esi),%xmm1
    paddq   ksKey+16*3-F_O(%esi),%xmm2
    paddq   ksKey+16*4-F_O(%esi),%xmm3
    paddq   ksTwk+16*1-F_O(%esi),%xmm1
    paddq   ksTwk+16*2-F_O(%esi),%xmm2
    paddq   %xmm4,%xmm3
#
    movq    ksKey-F_O(%esi),%xmm4   #first, "rotate" key schedule on the stack
    movq    ksTwk-F_O(%esi),%xmm5   # (for next time through)
    movq    %xmm4,ksKey+16*(WCNT+1)-F_O(%esi)
    movq    %xmm5,ksTwk+16*3-F_O(%esi)
    addl    $16,%esi                #bump rolling pointer
  .endif
  .if _SKEIN_DEBUG
    Skein_Debug_Round 256,SKEIN_RND_KEY_INJECT,SAVE_REGS
  .endif
.endm #R256_FourRounds
#
.if _SKEIN_DEBUG                    # routines for saving/restoring X_stk for debug routines
# the extra +4 in the offsets skips the return address pushed by the call
_Put_XMM_256:
  .irp _NN_,0,1,2,3
    movq    %xmm\_NN_,X_stk+4+\_NN_*8(%esp)
  .endr
    ret
#
_Get_XMM_256:
  .irp _NN_,0,1,2,3
    movq    X_stk+4+\_NN_*8(%esp),%xmm\_NN_
  .endr
    ret
.endif
#
#################
#
# code
#
# Entry: arguments are read from the caller's stack above the return address
#        (ctx, blkPtr, blkCnt, bitcntAdd). All general registers are saved and
#        restored by pushal/popal. XMM registers are overwritten.
# NOTE: blkCnt is tested only after a block has been processed (decl/jnz at the
#       bottom of the loop), so the routine must not be entered with blkCnt == 0.
# Register roles inside the loop:
#   ebx  base for caller parameters     edi  Skein context
#   ebp  esp+FRAME_OFFS (key schedule)  ecx  blocks remaining
#   esi  input block pointer, then rolling key schedule pointer (looping build)
#   edx  key injection counter          xmm0..xmm3  state words X[0..3]
#
C_label Skein_256_Process_Block
WCNT = 4                            #WCNT=4 for Skein-256
    Setup_Stack WCNT,ROUNDS_256
    # main hash loop for Skein_256
Skein_256_block_loop:
    movd    bitAdd (%ebx),%xmm4
    movq    TWEAK+0(%edi),%xmm5
    movq    TWEAK+8(%edi),%xmm6
    paddq   %xmm4 ,%xmm5            #bump T0 by the bitAdd parameter
    movq    %xmm5,TWEAK(%edi)       #save updated tweak value T0 (for next time)
    movapd  %xmm6,%xmm7
    xorpd   %xmm5,%xmm7             #compute overall tweak parity
    movdqa  %xmm5,ksTwk   -F_O(%ebp) #save the expanded tweak schedule on the stack
    movdqa  %xmm6,ksTwk+16-F_O(%ebp)
    movdqa  %xmm7,ksTwk+32-F_O(%ebp)

    movl    blkPtr(%ebx),%esi       #esi --> input block
    movl    $KW_PARITY_LO,%eax      #init key schedule parity accumulator
    movl    $KW_PARITY_HI,%edx
    movd    %eax ,%xmm4
    movd    %edx ,%xmm0
    unpcklps %xmm0,%xmm4            #replicate parity dword to 64 bits
#
  .irp _NN_,0,1,2,3                 #copy in the chaining vars
    movq    X_VARS+8*\_NN_(%edi),%xmm\_NN_
    xorpd   %xmm\_NN_,%xmm4         #update overall parity
    movdqa  %xmm\_NN_,ksKey+16*\_NN_-F_O(%ebp)
  .endr
    movdqa  %xmm4,ksKey+16*WCNT-F_O(%ebp) #save overall parity at the end of the array
#
    paddq   %xmm5,%xmm1             #inject the initial tweak words
    paddq   %xmm6,%xmm2
#
  .irp _NN_,0,1,2,3                 #perform the initial key injection
    movq    8*\_NN_(%esi),%xmm4     #and save a copy of the input block on stack
    movq    %xmm4,8*\_NN_+Wcopy(%esp)
    paddq   %xmm4,%xmm\_NN_         #inject the key word
  .endr
#
.if _SKEIN_DEBUG                    #debug dump of state at this point
    Skein_Debug_Block 256
    Skein_Debug_Round 256,SKEIN_RND_KEY_INITIAL,SAVE_REGS
.endif
    addl    $WCNT*8,%esi            #skip to the next block
    movl    %esi,blkPtr(%ebx)       #save the updated block pointer
    #
    # now the key schedule is computed. Start the rounds
    #
    xorl    %edx,%edx               #edx = iteration count
.if SKEIN_ASM_UNROLL & 256
_UNROLL_CNT = ROUNDS_256/8          #fully unrolled
.else
_UNROLL_CNT = SKEIN_UNROLL_256      #partial unroll count
  .if ((ROUNDS_256/8) % _UNROLL_CNT)
    .error "Invalid SKEIN_UNROLL_256" #sanity check
  .endif
    movl    %ebp,%esi               #use this as "rolling" pointer into ksTwk/ksKey
Skein_256_round_loop:               # (since there's no 16* scaled address mode)
.endif
#
_Rbase_ = 0
.rept _UNROLL_CNT*2                 # here with X[0..3] in XMM0..XMM3
    R_256_FourRounds _Rbase_
_Rbase_ = _Rbase_+4
.endr #rept _UNROLL_CNT*2
#
  .if _UNROLL_CNT <> (ROUNDS_256/8)
    cmpl    $2*(ROUNDS_256/8),%edx  #one key injection per four rounds
    jb      Skein_256_round_loop
  .endif
    #----------------------------
    # feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..3}
  .irp _NN_,0,1,2,3
    movq    Wcopy+8*\_NN_(%esp),%xmm4
    xorpd   %xmm4,%xmm\_NN_
    movq    %xmm\_NN_,X_VARS+8*\_NN_(%edi)
  .endr
    andb    $FIRST_MASK8,TWEAK +15(%edi)
.if _SKEIN_DEBUG
    Skein_Debug_Round 256,SKEIN_RND_FEED_FWD,SAVE_REGS
.endif
    # go back for more blocks, if needed
    decl    %ecx
    jnz     Skein_256_block_loop

    Reset_Stack _Skein_256_Process_Block
    ret
#
.ifdef _SKEIN_CODE_SIZE
# returns in eax the byte distance from the block function to this label
C_label Skein_256_Process_Block_CodeSize
    movl    $_Skein_256_Process_Block_CodeSize - _Skein_256_Process_Block,%eax
    ret
#
# returns in eax the unroll count, or 0 when fully unrolled
C_label Skein_256_Unroll_Cnt
  .if _UNROLL_CNT <> ROUNDS_256/8
    movl    $_UNROLL_CNT,%eax
  .else
    xorl    %eax,%eax
  .endif
    ret
.endif
.endif #_USE_ASM_ & 256
#
#----------------------------------------------------------------
#
.if _USE_ASM_ & 512
#
# void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)#
#
#################
# MACRO: one round = four mix steps on the register pairs (a0,a1) (b0,b1) (c0,c1) (d0,d1)
#   _RR_            round number; only its value modulo 8 selects the RC_512 row
#   a0,a1 .. d0,d1  XMM register numbers of the word pairs
#   Ra..Rd          column of the RC_512 row used for each pair
# All eight XMM registers hold state, so c0 and then a0 are parked in their
# X_stk slots while their registers serve as the rotate scratch.
#
.macro R_512_Round _RR_, a0,a1,Ra, b0,b1,Rb, c0,c1,Rc, d0,d1,Rd
  .irp _qq_,%((\_RR_) && 7)
_Ra_ = RC_512_\_qq_&&_\Ra
_Rb_ = RC_512_\_qq_&&_\Rb
_Rc_ = RC_512_\_qq_&&_\Rc
_Rd_ = RC_512_\_qq_&&_\Rd
  .endr
    paddq   %xmm\a1 , %xmm\a0
    _stX    c0                      #park c0, its register is the scratch for pair a
    movq    %xmm\a1 , %xmm\c0
    psllq   $   _Ra_ , %xmm\a1
    psrlq   $64-_Ra_ , %xmm\c0
    xorpd   %xmm\c0 , %xmm\a1
    xorpd   %xmm\a0 , %xmm\a1

    paddq   %xmm\b1 , %xmm\b0
    _stX    a0                      #park a0, its register is the scratch from here on
    movq    %xmm\b1 , %xmm\a0
    psllq   $   _Rb_ , %xmm\b1
    psrlq   $64-_Rb_ , %xmm\a0
    xorpd   %xmm\b0 , %xmm\b1
    _ldX    c0
    xorpd   %xmm\a0 , %xmm\b1

    paddq   %xmm\c1 , %xmm\c0
    movq    %xmm\c1 , %xmm\a0
    psllq   $   _Rc_ , %xmm\c1
    psrlq   $64-_Rc_ , %xmm\a0
    xorpd   %xmm\c0 , %xmm\c1
    xorpd   %xmm\a0 , %xmm\c1

    paddq   %xmm\d1 , %xmm\d0
    movq    %xmm\d1 , %xmm\a0
    psllq   $   _Rd_ , %xmm\d1
    psrlq   $64-_Rd_ , %xmm\a0
    xorpd   %xmm\a0 , %xmm\d1
    _ldX    a0
    xorpd   %xmm\d0 , %xmm\d1
  .if _SKEIN_DEBUG
    Skein_Debug_Round 512,%(_RR_+1),SAVE_REGS
  .endif
.endm
#
# MACRO: four rounds followed by one key injection
#   _RN_        number of the first of the four rounds
# State in xmm0..xmm7. xmm0 is parked in X_stk while it carries the injection
# counter. Bumps edx, and in the looping build advances esi by 16.
#
.macro R_512_FourRounds _RN_
    R_512_Round %((_RN_) ), 0,1,0, 2,3,1, 4,5,2, 6,7,3
    R_512_Round %((_RN_)+1), 2,1,0, 4,7,1, 6,5,2, 0,3,3
    R_512_Round %((_RN_)+2), 4,1,0, 6,3,1, 0,5,2, 2,7,3
    R_512_Round %((_RN_)+3), 6,1,0, 0,7,1, 2,5,2, 4,3,3

    #inject key schedule
  .irp _NN_,0,1,2,3,4,5,6,7
    .if _UNROLL_CNT == (ROUNDS_512/8)
    paddq   ksKey+16*((((\_RN_)/4)+(\_NN_)+1)%9)-F_O(%ebp),%xmm\_NN_
    .else
    paddq   ksKey+16*((\_NN_)+1)-F_O(%esi),%xmm\_NN_
    .endif
  .endr
    _stX    0                       #free up a register
    incl    %edx                    #bump round counter
    movd    %edx,%xmm0              #inject the tweak
  .if _UNROLL_CNT == (ROUNDS_512/8)
    # _RN_ is a multiple of 4 here, so _RN_ and _RN_/4 agree modulo 3
    paddq   ksTwk+16*(((_RN_)+1) % 3)-F_O(%ebp),%xmm5
    paddq   ksTwk+16*(((_RN_)+2) % 3)-F_O(%ebp),%xmm6
    paddq   %xmm0 ,%xmm7
  .else                             #looping version
    paddq   ksTwk+16*1-F_O(%esi),%xmm5
    paddq   ksTwk+16*2-F_O(%esi),%xmm6
    paddq   %xmm0 ,%xmm7
    # "rotate" key schedule on the stack (for next time through)
    movq    ksKey -F_O(%esi),%xmm0
    movq    %xmm0,ksKey+16*(WCNT+1)-F_O(%esi)
    movq    ksTwk -F_O(%esi),%xmm0
    movq    %xmm0,ksTwk+16*3 -F_O(%esi)
    addl    $16,%esi                #bump rolling pointer
  .endif
    _ldX    0                       #restore X0
  .if _SKEIN_DEBUG
    Skein_Debug_Round 512,SKEIN_RND_KEY_INJECT,SAVE_REGS
  .endif
.endm #R_512_FourRounds
#################
.if _SKEIN_DEBUG                    # routines for saving/restoring X_stk for debug routines
# the extra +4 in the offsets skips the return address pushed by the call
_Put_XMM_512:
  .irp _NN_,0,1,2,3,4,5,6,7
    movq    %xmm\_NN_,X_stk+4+\_NN_*8(%esp)
  .endr
    ret
#
_Get_XMM_512:
  .irp _NN_,0,1,2,3,4,5,6,7
    movq    X_stk+4+\_NN_*8(%esp),%xmm\_NN_
  .endr
    ret
.endif
#
#################
#
# Entry: arguments are read from the caller's stack above the return address
#        (ctx, blkPtr, blkCnt, bitcntAdd). All general registers are saved and
#        restored by pushal/popal. XMM registers are overwritten.
# NOTE: blkCnt is tested only after a block has been processed (decl/jnz at the
#       bottom of the loop), so the routine must not be entered with blkCnt == 0.
# Register roles inside the loop:
#   ebx  base for caller parameters     edi  Skein context
#   ebp  esp+FRAME_OFFS (key schedule)  ecx  blocks remaining
#   esi  input block pointer, then rolling key schedule pointer (looping build)
#   edx  key injection counter          xmm0..xmm7  state words X[0..7]
#
C_label Skein_512_Process_Block
WCNT = 8                            #WCNT=8 for Skein-512
    Setup_Stack WCNT,ROUNDS_512
    # main hash loop for Skein_512
Skein_512_block_loop:
    movd    bitAdd(%ebx) ,%xmm0
    movq    TWEAK+0(%edi),%xmm1
    movq    TWEAK+8(%edi),%xmm2
    paddq   %xmm0,%xmm1             #bump T0 by the bitAdd parameter
    movq    %xmm1,TWEAK(%edi)       #save updated tweak value T0 (for next time)
    movq    %xmm2,%xmm0
    xorpd   %xmm1,%xmm0             #compute overall tweak parity
    movdqa  %xmm1,ksTwk     -F_O(%ebp) #save the expanded tweak schedule on the stack
    movdqa  %xmm2,ksTwk+16*1-F_O(%ebp)
    movdqa  %xmm0,ksTwk+16*2-F_O(%ebp)

    movl    blkPtr(%ebx),%esi       #esi --> input block
    movl    $KW_PARITY_LO,%eax      #init key schedule parity accumulator
    movl    $KW_PARITY_HI,%edx
    movd    %eax ,%xmm0
    movd    %edx ,%xmm7
    unpcklps %xmm7,%xmm0            #replicate parity dword to 64 bits
#
  .irp _NN_,7,6,5,4,3,2,1           #copy in the chaining vars (skip #0 for now)
    movq    X_VARS+8*\_NN_(%edi),%xmm\_NN_
    xorpd   %xmm\_NN_,%xmm0         #update overall parity
    movdqa  %xmm\_NN_,ksKey+16*\_NN_-F_O(%ebp)
    .if \_NN_ == 5
    paddq   %xmm1,%xmm5             #inject the initial tweak words
    paddq   %xmm2,%xmm6             # (before they get trashed in %xmm1/2)
    .endif
  .endr
    movq    X_VARS(%edi),%xmm4      #handle #0 now
    xorpd   %xmm4,%xmm0             #update overall parity
    movdqa  %xmm4,ksKey+16* 0  -F_O(%ebp) #save the key value in slot #0
    movdqa  %xmm0,ksKey+16*WCNT-F_O(%ebp) #save overall parity at the end of the array
#
    movq    %xmm4,%xmm0
    # xmm4 is the scratch register of this loop, so word #4 is left out of
    # the list and handled separately right after it
  .irp _NN_,7,6,5,3,2,1,0           #perform the initial key injection (except #4)
    movq    8*\_NN_(%esi),%xmm4     #and save a copy of the input block on stack
    movq    %xmm4,8*\_NN_+Wcopy(%esp)
    paddq   %xmm4,%xmm\_NN_
  .endr
    movq    8*4(%esi),%xmm4         #get input block word #4
    movq    %xmm4,8*4+Wcopy(%esp)
    paddq   ksKey+16*4-F_O(%ebp),%xmm4 #inject the initial key
#
.if _SKEIN_DEBUG                    #debug dump of state at this point
    Skein_Debug_Block 512
    Skein_Debug_Round 512,SKEIN_RND_KEY_INITIAL,SAVE_REGS
.endif
    addl    $WCNT*8,%esi            #skip to the next block
    movl    %esi,blkPtr(%ebx)       #save the updated block pointer
    #
    # now the key schedule is computed. Start the rounds
    #
    xorl    %edx,%edx               #edx = round counter
.if SKEIN_ASM_UNROLL & 512
_UNROLL_CNT = ROUNDS_512/8
.else
_UNROLL_CNT = SKEIN_UNROLL_512
  .if ((ROUNDS_512/8) % _UNROLL_CNT)
    .error "Invalid SKEIN_UNROLL_512"
  .endif
    movl    %ebp,%esi               #use this as "rolling" pointer into ksTwk/ksKey
Skein_512_round_loop:               # (since there's no 16* scaled address mode)
.endif
_Rbase_ = 0
.rept _UNROLL_CNT*2
    R_512_FourRounds %_Rbase_
_Rbase_ = _Rbase_+4
.endr #rept _UNROLL_CNT
#
.if (SKEIN_ASM_UNROLL & 512) == 0
    cmpl    $2*(ROUNDS_512/8),%edx  #one key injection per four rounds
    jb      Skein_512_round_loop
.endif
    #----------------------------
    # feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..7}
    andb    $FIRST_MASK8,TWEAK +15(%edi)
  .irp _NN_,0,2,4,6                 #do the aligned ones first
    xorpd   Wcopy+8*\_NN_(%esp),%xmm\_NN_
    movq    %xmm\_NN_,X_VARS+8*\_NN_(%edi)
  .endr
  .irp _NN_,1,3,5,7                 #now we have some register space available
    movq    Wcopy+8*\_NN_(%esp),%xmm0
    xorpd   %xmm0,%xmm&\_NN_
    movq    %xmm&\_NN_,X_VARS+8*\_NN_(%edi)
  .endr
.if _SKEIN_DEBUG
    Skein_Debug_Round 512,SKEIN_RND_FEED_FWD
.endif
    # go back for more blocks, if needed
    decl    %ecx
    jnz     Skein_512_block_loop

    Reset_Stack _Skein_512_Process_Block
    ret
#
.ifdef _SKEIN_CODE_SIZE
# returns in eax the byte distance from the block function to this label
C_label Skein_512_Process_Block_CodeSize
    movl    $(_Skein_512_Process_Block_CodeSize - _Skein_512_Process_Block),%eax
    ret
#
# returns in eax the unroll count, or 0 when fully unrolled
C_label Skein_512_Unroll_Cnt
  .if _UNROLL_CNT <> ROUNDS_512/8
    movl    $_UNROLL_CNT,%eax
  .else
    xorl    %eax,%eax
  .endif
    ret
.endif
#
.endif # _USE_ASM_ & 512
#
#----------------------------------------------------------------
#
.if _USE_ASM_ & 1024
    .global _Skein1024_Process_Block
#
# void Skein_1024_Process_Block(Skein_1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)#
#
R_1024_REGS = (5)                   #keep this many block variables in registers
#
################
.if _SKEIN_DEBUG                    # routines for saving/restoring X_stk for debug routines
# only the first R_1024_REGS words live in registers, so only those are moved
# the extra +4 in the offsets skips the return address pushed by the call
_Put_XMM_1024:
_NN_ = 0
  .rept R_1024_REGS
   .irp _rr_,%(_NN_)
    movq    %xmm\_rr_,X_stk+4+8*_NN_(%esp)
   .endr
_NN_ = _NN_+1
  .endr
    ret
#
_Get_XMM_1024:
_NN_ = 0
  .rept R_1024_REGS
   .irp _rr_,%(_NN_)
    movq    X_stk+4+8*_NN_(%esp),%xmm\_rr_
   .endr
_NN_ = _NN_+1
  .endr
    ret
.endif
#
#################
# MACRO: one mix step (x0 += x1, rotate x1, x1 ^= x0)
#   x0,x1       state word indices (0..15)
#   rotIdx0     round number; only its value modulo 8 selects the RC_1024 row
#   rotIdx1     column of that row
#   _debug_     nonzero: emit a debug round dump after the step (debug builds only)
# Words below R_1024_REGS are used in place in %xmm<index>. Any other word is
# loaded from X_stk into xmm5 (x0) or xmm6 (x1) and stored back afterwards.
# xmm7 is the rotate scratch.
#
.macro MixStep_1024 x0,x1,rotIdx0,rotIdx1,_debug_=0
_r0_ = \x0                          #default, if already loaded
_r1_ = \x1
  # load the regs (if necessary)
  .if (\x0 >= R_1024_REGS)
_r0_ = 5
    movq    X_stk+8*(\x0)(%esp),%xmm5
  .endif
  .if (\x1 >= R_1024_REGS)
_r1_ = 6
    movq    X_stk+8*(\x1)(%esp),%xmm6
  .endif
  # do the mix
  .irp _rx_,%((rotIdx0) && 7)
_Rc_ = RC_1024_\_rx_&&_\rotIdx1     #rotation constant
  .endr
  .irp _x0_,%_r0_
  .irp _x1_,%_r1_
    paddq   %xmm\_x1_,%xmm\_x0_
    movq    %xmm\_x1_,%xmm7
    psllq   $   _Rc_  ,%xmm\_x1_
    psrlq   $64-_Rc_  ,%xmm7
    xorpd   %xmm\_x0_,%xmm\_x1_
    xorpd   %xmm7    ,%xmm\_x1_
  .endr
  .endr
  # save the regs (if necessary)
  .if (\x0 >= R_1024_REGS)
    movq    %xmm5,X_stk+8*(\x0)(%esp)
  .endif
  .if (\x1 >= R_1024_REGS)
    movq    %xmm6,X_stk+8*(\x1)(%esp)
  .endif
  # debug output
  .if _SKEIN_DEBUG && (\_debug_)
    Skein_Debug_Round 1024,%((\rotIdx0)+1),SAVE_REGS
  .endif
.endm
#################
# MACRO: four rounds followed by one key injection
#   _RR_        number of the first of the four rounds
# The last mix step of each round passes _debug_=1. Bumps edx, and in the
# looping build advances esi by 16. xmm6/xmm7 are scratch during injection.
#
.macro R_1024_FourRounds _RR_
    #--------- round _RR_
    MixStep_1024  0, 1,%((\_RR_)+0),0
    MixStep_1024  2, 3,%((\_RR_)+0),1
    MixStep_1024  4, 5,%((\_RR_)+0),2
    MixStep_1024  6, 7,%((\_RR_)+0),3
    MixStep_1024  8, 9,%((\_RR_)+0),4
    MixStep_1024 10,11,%((\_RR_)+0),5
    MixStep_1024 12,13,%((\_RR_)+0),6
    MixStep_1024 14,15,%((\_RR_)+0),7,1
    #--------- round _RR_+1
    MixStep_1024  0, 9,%((\_RR_)+1),0
    MixStep_1024  2,13,%((\_RR_)+1),1
    MixStep_1024  6,11,%((\_RR_)+1),2
    MixStep_1024  4,15,%((\_RR_)+1),3
    MixStep_1024 10, 7,%((\_RR_)+1),4
    MixStep_1024 12, 3,%((\_RR_)+1),5
    MixStep_1024 14, 5,%((\_RR_)+1),6
    MixStep_1024  8, 1,%((\_RR_)+1),7,1
    #--------- round _RR_+2
    MixStep_1024  0, 7,%((\_RR_)+2),0
    MixStep_1024  2, 5,%((\_RR_)+2),1
    MixStep_1024  4, 3,%((\_RR_)+2),2
    MixStep_1024  6, 1,%((\_RR_)+2),3
    MixStep_1024 12,15,%((\_RR_)+2),4
    MixStep_1024 14,13,%((\_RR_)+2),5
    MixStep_1024  8,11,%((\_RR_)+2),6
    MixStep_1024 10, 9,%((\_RR_)+2),7,1
    #--------- round _RR_+3
    MixStep_1024  0,15,%((\_RR_)+3),0
    MixStep_1024  2,11,%((\_RR_)+3),1
    MixStep_1024  6,13,%((\_RR_)+3),2
    MixStep_1024  4, 9,%((\_RR_)+3),3
    MixStep_1024 14, 1,%((\_RR_)+3),4
    MixStep_1024  8, 5,%((\_RR_)+3),5
    MixStep_1024 10, 3,%((\_RR_)+3),6
    MixStep_1024 12, 7,%((\_RR_)+3),7,1

    incl    %edx                    #edx = round number
    movd    %edx,%xmm7
    #inject the key
  .irp _NN_,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
  .if _UNROLL_CNT <> (ROUNDS_1024/8)
    #looping version: key schedule addressed through the rolling pointer esi
    .if \_NN_ < R_1024_REGS
      paddq ksKey+16*\_NN_+16-F_O(%esi),%xmm&\_NN_
    .else
      movq  X_stk+ 8*\_NN_(%esp),%xmm6
      .if     \_NN_ == 15
        paddq %xmm7,%xmm6
      .elseif \_NN_ == 14
        paddq ksTwk+16*2-F_O(%esi),%xmm6
      .elseif \_NN_ == 13
        paddq ksTwk+16*1-F_O(%esi),%xmm6
      .endif
      paddq ksKey+16*\_NN_+16-F_O(%esi),%xmm6
      movq  %xmm6,X_stk+ 8*\_NN_(%esp)
    .endif
  .else
    #fully unrolled version: key schedule slots computed at assembly time
    .if \_NN_ < R_1024_REGS
      paddq ksKey+16*(((_Rbase_/4)+(\_NN_)+1) % 17)-F_O(%ebp),%xmm&\_NN_
    .else
      movq  X_stk+ 8*\_NN_(%esp), %xmm6
      paddq ksKey+16*(((_Rbase_/4)+(\_NN_)+1) % 17)-F_O(%ebp),%xmm6
      .if     \_NN_ == 15
        paddq %xmm7,%xmm6
      .elseif \_NN_ == 14
        paddq ksTwk+16*(((_Rbase_/4)+2) % 3)-F_O(%ebp),%xmm6
      .elseif \_NN_ == 13
        paddq ksTwk+16*(((_Rbase_/4)+1) % 3)-F_O(%ebp),%xmm6
      .endif
      movq  %xmm6,X_stk+ 8*\_NN_(%esp)
    .endif
  .endif
  .endr
  .if _UNROLL_CNT <> (ROUNDS_1024/8) #rotate the key schedule on the stack
    movq    ksKey-F_O(%esi), %xmm6
    movq    ksTwk-F_O(%esi), %xmm7
    movq    %xmm6,ksKey+16*(WCNT+1)-F_O(%esi)
    movq    %xmm7,ksTwk+16* 3 -F_O(%esi)
    addl    $16,%esi                #bump rolling pointer
  .endif
  .if _SKEIN_DEBUG
    Skein_Debug_Round 1024,SKEIN_RND_KEY_INJECT ,SAVE_REGS
  .endif
.endm #R_1024_FourRounds
#
################
#
# Entry: arguments are read from the caller's stack above the return address
#        (ctx, blkPtr, blkCnt, bitcntAdd). All general registers are saved and
#        restored by pushal/popal. XMM registers are overwritten.
# NOTE: blkCnt is tested only after a block has been processed (decl/jnz at the
#       bottom of the loop), so the routine must not be entered with blkCnt == 0.
# Register roles inside the loop:
#   ebx  base for caller parameters     edi  Skein context + 0x80
#   ebp  esp+FRAME_OFFS (key schedule)  ecx  blocks remaining
#   esi  input block pointer, then rolling key schedule pointer (looping build)
#   edx  key injection counter          eax  esp + 0x80 during setup/feedforward
#   xmm0..xmm4  state words X[0..4]; X[5..15] live in X_stk
#
C_label Skein1024_Process_Block
#
WCNT = 16                           #WCNT=16 for Skein-1024
    Setup_Stack WCNT,ROUNDS_1024
    addl    $0x80,%edi              #bias the edi ctxt offsets to keep them all short
    # main hash loop for Skein1024
Skein1024_block_loop:
    movd    bitAdd(%ebx)      ,%xmm0
    movq    TWEAK+0-0x80(%edi),%xmm1
    movq    TWEAK+8-0x80(%edi),%xmm2
    paddq   %xmm0,%xmm1             #bump T0 by the bitAdd parameter
    movq    %xmm1,TWEAK-0x80(%edi)  #save updated tweak value T0 (for next time)
    movq    %xmm2,%xmm0
    xorpd   %xmm1,%xmm0             #compute overall tweak parity
    movdqa  %xmm1,ksTwk   -F_O(%ebp) #save the expanded tweak schedule on the stack
    movdqa  %xmm2,ksTwk+16-F_O(%ebp)
    movdqa  %xmm0,ksTwk+32-F_O(%ebp)

    movl    blkPtr(%ebx),%esi       #esi --> input block
    movl    $KW_PARITY_LO,%eax      #init key schedule parity accumulator
    movl    $KW_PARITY_HI,%edx
    movd    %eax ,%xmm7
    movd    %edx ,%xmm6
    unpcklps %xmm6,%xmm7            #replicate parity dword to 64 bits
#
    leal    0x80(%esp),%eax         #use short offsets for Wcopy, X_stk writes below
.irp _NN_,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
    movq    X_VARS+8*\_NN_-0x80(%edi),%xmm6
    xorpd   %xmm6,%xmm7             #update overall parity
    movdqa  %xmm6,ksKey+16*\_NN_-F_O(%ebp) #save the key schedule on the stack
  # pick the working register: the word's own register, or xmm5 for stack words
  .if \_NN_ < R_1024_REGS
_rr_ = \_NN_
  .else
_rr_ = R_1024_REGS
  .endif
  .irp _rn_,%(_rr_)
    movq    8*\_NN_(%esi),%xmm\_rn_ #save copy of the input block on stack
    movq    %xmm\_rn_,Wcopy+8*\_NN_-0x80(%eax) #(for feedforward later)
    paddq   %xmm6,%xmm\_rn_         #inject the key into the block
    .if     \_NN_ == 13
    paddq   %xmm1,%xmm\_rn_         #inject the initial tweak words
    .elseif \_NN_ == 14
    paddq   %xmm2,%xmm\_rn_
    .endif
    .if \_NN_ >= R_1024_REGS        #only save X[5..15] on stack, leave X[0..4] in regs
    movq    %xmm\_rn_,X_stk+8*\_NN_-0x80(%eax)
    .endif
  .endr
.endr
    movdqa  %xmm7,ksKey+16*WCNT-F_O(%ebp) #save overall key parity at the end of the array
#
.if _SKEIN_DEBUG                    #debug dump of state at this point
    Skein_Debug_Block 1024
    Skein_Debug_Round 1024,SKEIN_RND_KEY_INITIAL,SAVE_REGS
.endif
    addl    $WCNT*8,%esi            #skip to the next block
    movl    %esi,blkPtr(%ebx)       #save the updated block pointer
    #
    # now the key schedule is computed. Start the rounds
    #
    xorl    %edx,%edx               #edx = round counter
.if SKEIN_ASM_UNROLL & 1024
_UNROLL_CNT = ROUNDS_1024/8
.else
_UNROLL_CNT = SKEIN_UNROLL_1024
  .if ((ROUNDS_1024/8) % _UNROLL_CNT)
    .error "Invalid SKEIN_UNROLL_1024"
  .endif
    movl    %ebp,%esi               #use this as "rolling" pointer into ksTwk/ksKey
Skein_1024_round_loop:
.endif
#
_Rbase_ = 0
.rept _UNROLL_CNT*2
    R_1024_FourRounds %_Rbase_
_Rbase_ = _Rbase_+4
.endr #rept _UNROLL_CNT
#
.if (SKEIN_ASM_UNROLL & 1024) == 0
    cmpl    $2*(ROUNDS_1024/8),%edx #one key injection per four rounds
    jb      Skein_1024_round_loop
.endif
    andb    $FIRST_MASK8,TWEAK +15-0x80(%edi) #clear tweak bit for next time thru
    #----------------------------
    # feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..15}
    leal    0x80(%esp),%eax         #allow short offsets to X_stk and Wcopy
  .irp _NN_,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
    .if \_NN_ < R_1024_REGS
      .if \_NN_ && 1                #already in regs: no load needed
        movq    Wcopy+ 8*\_NN_-0x80(%eax),%xmm7 #unaligned
        xorpd   %xmm7,%xmm\_NN_
      .else
        xorpd   Wcopy+ 8*\_NN_-0x80(%eax),%xmm\_NN_ #aligned
      .endif
      movq      %xmm\_NN_,X_VARS+8*\_NN_-0x80(%edi)
    .else
      movq      X_stk+8*\_NN_-0x80(%eax),%xmm7 #load X value from stack
      .if \_NN_ && 1
        movq    Wcopy+8*\_NN_-0x80(%eax),%xmm6 #unaligned
        xorpd   %xmm6,%xmm7
      .else
        xorpd   Wcopy+8*\_NN_-0x80(%eax),%xmm7 #aligned
      .endif
      movq      %xmm7,X_VARS+8*\_NN_-0x80(%edi)
    .endif
  .endr
.if _SKEIN_DEBUG
    Skein_Debug_Round 1024,SKEIN_RND_FEED_FWD #no need to save regs on stack here
.endif
    # go back for more blocks, if needed
    decl    %ecx
    jnz     Skein1024_block_loop

    Reset_Stack _Skein1024_Process_Block
    ret
#
.ifdef _SKEIN_CODE_SIZE
# returns in eax the byte distance from the block function to this label
C_label Skein1024_Process_Block_CodeSize
    movl    $(_Skein1024_Process_Block_CodeSize - _Skein1024_Process_Block),%eax
    ret
#
# returns in eax the unroll count, or 0 when fully unrolled
C_label Skein1024_Unroll_Cnt
  .if _UNROLL_CNT <> ROUNDS_1024/8
    movl    $_UNROLL_CNT,%eax
  .else
    xorl    %eax,%eax
  .endif
    ret
.endif
#
.endif # _USE_ASM_ & 1024
#----------------------------------------------------------------
    .end