#
#----------------------------------------------------------------
# 32-bit x86 assembler code for Skein block functions using XMM registers
#
# Author: Doug Whiting, Hifn/Exar
#
# This code is released to the public domain.
#----------------------------------------------------------------
#
    .text
    .altmacro                               #use advanced macro features
    .psize 0,128                            #list file has no page boundaries
#
_MASK_ALL_  =   (256+512+1024)              #all three algorithm bits
SAVE_REGS   =   1                           #value handed to Skein_Debug_Round as its saveRegs argument
#
#################
# _USE_ASM_ = bit mask (256/512/1024) of the block functions to assemble.
# If SKEIN_USE_ASM is undefined, or has none of the three bits set,
# all three block functions are assembled.
.ifndef SKEIN_USE_ASM
_USE_ASM_        = _MASK_ALL_
.elseif SKEIN_USE_ASM & _MASK_ALL_
_USE_ASM_        = SKEIN_USE_ASM
.else
_USE_ASM_        = _MASK_ALL_
.endif
#
#################
# _SKEIN_LOOP carries one decimal digit per block size:
#   hundreds digit -> Skein-256, tens digit -> Skein-512, units digit -> Skein-1024
# Each digit is the unroll count for that size (see below).
.ifndef SKEIN_LOOP  
_SKEIN_LOOP       = 002                     #default is all fully unrolled, except Skein1024
.else
_SKEIN_LOOP       = SKEIN_LOOP
.endif
#--------------
# the unroll counts (0 --> fully unrolled)
SKEIN_UNROLL_256  = (_SKEIN_LOOP / 100) % 10
SKEIN_UNROLL_512  = (_SKEIN_LOOP /  10) % 10
SKEIN_UNROLL_1024 = (_SKEIN_LOOP      ) % 10
#
# SKEIN_ASM_UNROLL = sum of the block sizes whose unroll count is zero,
# i.e. a bit mask of the sizes that are generated fully unrolled
SKEIN_ASM_UNROLL  = 0
  .irp _NN_,256,512,1024
    .if (SKEIN_UNROLL_\_NN_) == 0
SKEIN_ASM_UNROLL  = SKEIN_ASM_UNROLL + \_NN_
    .endif
  .endr
#
#################
#
# Round counts. Without SKEIN_ROUNDS the defaults are 72/72/80.
# With SKEIN_ROUNDS, each decimal digit d (hundreds = 256, tens = 512,
# units = 1024) selects 8*(((d+5) % 10)+5) rounds, and the chosen counts
# are printed for every enabled block size.
# NOTE(review): the doubled ampersand in the .if below looks like the
#   altmacro-escaped form of a bitwise AND inside an .irp body -- confirm
#   against the assembler version in use.
.ifndef SKEIN_ROUNDS
ROUNDS_256  =   72
ROUNDS_512  =   72
ROUNDS_1024 =   80
.else
ROUNDS_256  = 8*((((SKEIN_ROUNDS / 100) + 5) % 10) + 5)
ROUNDS_512  = 8*((((SKEIN_ROUNDS /  10) + 5) % 10) + 5)
ROUNDS_1024 = 8*((((SKEIN_ROUNDS      ) + 5) % 10) + 5)
.irp _NN_,256,512,1024
  .if _USE_ASM_ && \_NN_
    .irp _RR_,%(ROUNDS_\_NN_)
      .if \_NN_ < 1024
.print  "+++ SKEIN_ROUNDS_\_NN_  = \_RR_"
      .else
.print  "+++ SKEIN_ROUNDS_\_NN_ = \_RR_"
      .endif
    .endr
  .endif
.endr
.endif
#################
#
# _SKEIN_CODE_SIZE is defined (value 1) when either SKEIN_CODE_SIZE or
# SKEIN_PERF is defined; it is tested with .ifdef to emit the
# *_Process_Block_CodeSize and *_Unroll_Cnt helper routines.
.ifdef SKEIN_CODE_SIZE
_SKEIN_CODE_SIZE = (1)
.else
.ifdef  SKEIN_PERF                           #use code size if SKEIN_PERF is defined
_SKEIN_CODE_SIZE = (1)
.endif
.endif
#
#################
#
# _SKEIN_DEBUG = 1 when SKEIN_DEBUG is defined, else 0; it gates the
# debug macros and the calls to the external Skein_Show_* routines.
.ifndef SKEIN_DEBUG
_SKEIN_DEBUG      = 0
.else
_SKEIN_DEBUG      = 1
.endif
#################
#
# define offsets of fields in hash context structure
#
# All offsets are in bytes from the start of the context. The layout
# implied by the arithmetic below is: two 4-byte fields, then 16 bytes
# of tweak, then the chaining variables.
HASH_BITS   =   0                           ## bits of hash output
BCNT        =   4 + HASH_BITS               #number of bytes in BUFFER[]
TWEAK       =   4 + BCNT                    #tweak values[0..1]
X_VARS      =  16 + TWEAK                   #chaining vars
#
#(Note: buffer[] in context structure is NOT needed here :-)
#
# KW_PARITY_LO/HI are loaded into eax/edx and merged into one 64-bit
# value that seeds the XOR accumulator over the chaining words.
KW_PARITY_LO=   0xA9FC1A22                  #overall parity of key schedule words (hi32/lo32)
KW_PARITY_HI=   0x1BD11BDA
# FIRST_MASK8 is and-ed into the byte at TWEAK+15 after every block,
# clearing bit 6 of that byte.
FIRST_MASK8 =   ~ (1 << 6)                  #FIRST block flag bit
#
# rotation constants for Skein
#
# Naming: RC_<block bits>_<r>_<j>
#   <r> = index 0..7 chosen by the round macros from the round number
#   <j> = index of the word pair mixed within that round
# The round macros build these names by pasting the indices together,
# so the symbol names must keep exactly this form.
RC_256_0_0  = 14
RC_256_0_1  = 16

RC_256_1_0  = 52
RC_256_1_1  = 57

RC_256_2_0  = 23
RC_256_2_1  = 40

RC_256_3_0  =  5
RC_256_3_1  = 37

RC_256_4_0  = 25
RC_256_4_1  = 33

RC_256_5_0  = 46
RC_256_5_1  = 12

RC_256_6_0  = 58
RC_256_6_1  = 22

RC_256_7_0  = 32
RC_256_7_1  = 32

# Skein-512: four word pairs per round
RC_512_0_0  = 46
RC_512_0_1  = 36
RC_512_0_2  = 19
RC_512_0_3  = 37

RC_512_1_0  = 33
RC_512_1_1  = 27
RC_512_1_2  = 14
RC_512_1_3  = 42

RC_512_2_0  = 17
RC_512_2_1  = 49
RC_512_2_2  = 36
RC_512_2_3  = 39

RC_512_3_0  = 44
RC_512_3_1  =  9
RC_512_3_2  = 54
RC_512_3_3  = 56

RC_512_4_0  = 39
RC_512_4_1  = 30
RC_512_4_2  = 34
RC_512_4_3  = 24

RC_512_5_0  = 13
RC_512_5_1  = 50
RC_512_5_2  = 10
RC_512_5_3  = 17

RC_512_6_0  = 25
RC_512_6_1  = 29
RC_512_6_2  = 39
RC_512_6_3  = 43

RC_512_7_0  =  8
RC_512_7_1  = 35
RC_512_7_2  = 56
RC_512_7_3  = 22

# Skein-1024: eight word pairs per round
RC_1024_0_0 = 24
RC_1024_0_1 = 13
RC_1024_0_2 =  8
RC_1024_0_3 = 47
RC_1024_0_4 =  8
RC_1024_0_5 = 17
RC_1024_0_6 = 22
RC_1024_0_7 = 37

RC_1024_1_0 = 38
RC_1024_1_1 = 19
RC_1024_1_2 = 10
RC_1024_1_3 = 55
RC_1024_1_4 = 49
RC_1024_1_5 = 18
RC_1024_1_6 = 23
RC_1024_1_7 = 52

RC_1024_2_0 = 33
RC_1024_2_1 =  4
RC_1024_2_2 = 51
RC_1024_2_3 = 13
RC_1024_2_4 = 34
RC_1024_2_5 = 41
RC_1024_2_6 = 59
RC_1024_2_7 = 17

RC_1024_3_0 =  5
RC_1024_3_1 = 20
RC_1024_3_2 = 48
RC_1024_3_3 = 41
RC_1024_3_4 = 47
RC_1024_3_5 = 28
RC_1024_3_6 = 16
RC_1024_3_7 = 25

RC_1024_4_0 = 41
RC_1024_4_1 =  9
RC_1024_4_2 = 37
RC_1024_4_3 = 31
RC_1024_4_4 = 12
RC_1024_4_5 = 47
RC_1024_4_6 = 44
RC_1024_4_7 = 30

RC_1024_5_0 = 16
RC_1024_5_1 = 34
RC_1024_5_2 = 56
RC_1024_5_3 = 51
RC_1024_5_4 =  4
RC_1024_5_5 = 53
RC_1024_5_6 = 42
RC_1024_5_7 = 41

RC_1024_6_0 = 31
RC_1024_6_1 = 44
RC_1024_6_2 = 47
RC_1024_6_3 = 46
RC_1024_6_4 = 19
RC_1024_6_5 = 42
RC_1024_6_6 = 44
RC_1024_6_7 = 25

RC_1024_7_0 =  9
RC_1024_7_1 = 48
RC_1024_7_2 = 35
RC_1024_7_3 = 52
RC_1024_7_4 = 23
RC_1024_7_5 = 31
RC_1024_7_6 = 37
RC_1024_7_7 = 20
#
#----------------------------------------------------------------
# declare allocated space on the stack
.macro StackVar  localName,localSize
    # Bind the name to the current running offset, then advance the
    # running offset (_STK_OFFS_) by the size of this variable.
    .set    \localName, _STK_OFFS_
    .set    _STK_OFFS_, _STK_OFFS_+(\localSize)
.endm #StackVar
#
#----------------------------------------------------------------
#
# MACRO: Configure stack frame, allocate local vars
#
# Arguments:
#   WCNT     number of 64-bit words in the block (sizes the arrays below)
#   RND_CNT  round count (sizes the optional ksRot area)
# Defines the offset symbols X_stk, Wcopy, ksTwk, ksKey (and ksRot when the
# block size is not fully unrolled), FRAME_OFFS/F_O and LOCAL_SIZE, plus the
# ebx-relative symbols savRegs, retAddr, ctxPtr, blkPtr, blkCnt, bitAdd.
# Register state after the emitted code:
#   ebx = esp right after pushal (base for the caller parameter symbols)
#   esp = 16-byte aligned base of the frame variables
#   ebp = esp + FRAME_OFFS
#   edi = context pointer, ecx = block count
.macro Setup_Stack WCNT,RND_CNT
_STK_OFFS_  =   0                   #starting offset from esp, forced on 16-byte alignment
    #----- local  variables         #<-- esp
    StackVar    X_stk  , 8*(WCNT)   #local context vars
    StackVar    Wcopy  , 8*(WCNT)   #copy of input block    
    StackVar    ksTwk  ,16*3        #key schedule: tweak words
    StackVar    ksKey  ,16*(WCNT)+16#key schedule: key   words
FRAME_OFFS  =   ksTwk+128           #<-- ebp
F_O         =   FRAME_OFFS          #syntactic shorthand
    # NOTE(review): the doubled ampersand below looks like the altmacro-escaped
    #   bitwise AND that tests the bit for this block size -- confirm
  .if (SKEIN_ASM_UNROLL && (WCNT*64)) == 0
    StackVar    ksRot,16*(RND_CNT/4)#leave space for ks "rotation" to happen
  .endif
LOCAL_SIZE  =   _STK_OFFS_          #size of local vars
    #
    #"restart" the stack defns, because we relocate esp to guarantee alignment
    #    (i.e., these vars are NOT at fixed offsets from esp)
_STK_OFFS_  =   0
    #----- 
    StackVar    savRegs,8*4         #pushad data
    StackVar    retAddr,4           #return address
    #----- caller parameters
    StackVar    ctxPtr ,4           #context ptr
    StackVar    blkPtr ,4           #pointer to block data
    StackVar    blkCnt ,4           #number of full blocks to process
    StackVar    bitAdd ,4           #bit count to add to tweak
    #----- caller's stack frame
#
# Notes on stack frame setup:
#   * the most used variable (except for Skein-256) is X_stk[], based at [esp+0]
#   * the next most used is the key schedule words
#       so ebp is "centered" there, allowing short offsets to the key/tweak
#       schedule in 256/512-bit Skein cases, but not possible for Skein-1024 :-(
#   * the Wcopy variables are infrequently accessed, and they have long 
#       offsets from both esp and ebp only in the 1024-bit case.
#   * all other local vars and calling parameters can be accessed 
#       with short offsets, except in the 1024-bit case
#
    pushal                          #save all regs
    movl    %esp,%ebx               #keep ebx as pointer to caller parms
    subl    $LOCAL_SIZE,%esp        #make room for the locals
    andl    $~15,%esp               #force alignment
    movl    ctxPtr(%ebx),%edi       #edi --> Skein context
    leal    FRAME_OFFS(%esp),%ebp   #maximize use of short offsets from ebp
    movl    blkCnt(%ebx),%ecx       #keep block cnt in ecx
.endm #Setup_Stack
#
#----------------------------------------------------------------
#
# Undo Setup_Stack: ebx still holds the esp value captured right after
# pushal, so restoring esp from it drops the whole frame, and popal then
# restores the registers saved on entry.
# The procStart argument is accepted but not referenced in the body.
.macro Reset_Stack,procStart
    movl     %ebx,%esp              #get rid of locals (wipe??)
    popal                           #restore all regs
.endm # Reset_Stack
#
#----------------------------------------------------------------
# macros to help debug internals
#
.if _SKEIN_DEBUG
    .extern   _Skein_Show_Block   #calls to C routines
    .extern   _Skein_Show_Round
#
# Round numbers at or above SKEIN_RND_SPECIAL are markers for
# non-round events rather than real round numbers.
SKEIN_RND_SPECIAL       =   1000
SKEIN_RND_KEY_INITIAL   =   SKEIN_RND_SPECIAL+0
SKEIN_RND_KEY_INJECT    =   SKEIN_RND_SPECIAL+1
SKEIN_RND_FEED_FWD      =   SKEIN_RND_SPECIAL+2
#
# MACRO: dump the block-level state through the C routine Skein_Show_Block.
#   BLK_BITS  block size in bits; also selects the _Put_XMM_/_Get_XMM_ helpers
# The XMM state is written to X_stk before the call and read back after it;
# the general registers are preserved with pushal/popal.
.macro Skein_Debug_Block BLK_BITS
#
#void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X,
#                     const u08b_t *blkPtr, const u64b_t *wPtr, 
#                     const u64b_t *ksPtr,const u64b_t *tsPtr)#
#
    call    _Put_XMM_\BLK_BITS
    pushal                          #save all regs
    leal    ksTwk+1-F_O(%ebp),%eax  #+1 = flag: "stride" size = 2 qwords
    leal    ksKey+1-F_O(%ebp),%esi
    leal    Wcopy+32(%esp),%ecx     #adjust offset by 32 for pushad
    movl    ctxPtr(%ebx)  ,%edx     #ctx_hdr_ptr
    leal    X_VARS(%edx)  ,%edx     #edx ==> cxt->X[]
    # arguments are pushed last-to-first (seven dwords in total)
    pushl   %eax                    #tsPtr
    pushl   %esi                    #ksPtr
    pushl   %ecx                    #wPtr
    pushl   blkPtr(%ebx)            #blkPtr
    pushl   %edx                    #ctx->Xptr
    pushl   ctxPtr(%ebx)            #ctx_hdr_ptr
    movl    $\BLK_BITS,%eax
    pushl   %eax                    #bits
    call    _Skein_Show_Block
    addl    $7*4,%esp               #discard parameter space on stack
    popal                           #restore regs
#
    call    _Get_XMM_\BLK_BITS
.endm #Skein_Debug_Block

#
# MACRO: dump the state after one round (or special event) through the
# C routine Skein_Show_Round.
#   BLK_BITS  block size in bits
#   R         round number, or one of the SKEIN_RND_* markers
#   saveRegs  nonzero: write the XMM state to X_stk before the call and
#             read it back afterwards (default 0)
# For SKEIN_RND_FEED_FWD the X pointer passed is the context chaining
# variables; otherwise it is X_stk.
.macro Skein_Debug_Round BLK_BITS,R,saveRegs=0
#
#void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,int r,const u64b_t *X)#
#
  .if \saveRegs
    call    _Put_XMM_\BLK_BITS
  .endif
    pushal                          #save all regs
  .if R <> SKEIN_RND_FEED_FWD
    leal    32+X_stk(%esp),%eax     #adjust offset by 32 for pushal
  .else
    movl    ctxPtr(%ebx),%eax
    addl    $X_VARS,%eax
  .endif
    pushl   %eax                    #Xptr
    # NOTE(review): the doubled ampersands in the next two expressions look
    #   like altmacro-escaped bitwise ANDs -- confirm for the assembler in use
  .if (SKEIN_ASM_UNROLL && \BLK_BITS) || (\R >= SKEIN_RND_SPECIAL)
    movl    $\R,%eax
  .else     #compute round number from edx, R
    leal    1+(((\R)-1) && 3)(,%edx,4),%eax
  .endif
    pushl   %eax                    #round number
    pushl   ctxPtr(%ebx)            #ctx_hdr_ptr
    movl    $\BLK_BITS,%eax
    pushl   %eax                    #bits
    call    _Skein_Show_Round
    addl    $4*4,%esp               #discard parameter space on stack
    popal                           #restore regs
  .if \saveRegs
    call  _Get_XMM_\BLK_BITS        #reload the XMM state written out before the call
  .endif
.endm  #Skein_Debug_Round
.endif #ifdef SKEIN_DEBUG
#
#----------------------------------------------------------------
# useful macros
# _ldX xn : load 64-bit word number xn of X_stk[] into register xmm<xn>
.macro _ldX xn
    movq          X_stk+8*(\xn)(%esp),%xmm\xn
.endm

# _stX xn : store the low 64 bits of xmm<xn> into word number xn of X_stk[]
.macro _stX xn
    movq  %xmm\xn,X_stk+8*(\xn)(%esp)
.endm
#
#----------------------------------------------------------------
#
.macro C_label lName
    # Export the entry point under both spellings (with and without a
    # leading underscore) so it links under either C naming convention.
    .globl   \lName
    .globl  _\lName
 \lName:
_\lName:
.endm
#

.if _USE_ASM_ & 256
#
# void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)#
#
#################
#
# Skein-256 round macros
#
# MACRO: one round over four 64-bit words held in XMM registers.
#   _RR_       round number; it selects the RC_256_<r>_* constant pair
#   x0,x1      register numbers of the first  pair: x0 += x1, x1 = rotl(x1) ^ x0
#   x2,x3      register numbers of the second pair: x2 += x3, x3 = rotl(x3) ^ x2
#   t0,t1      register numbers of two scratch registers
# The rotate is built from a left shift, a right shift by (64 - count)
# and two XORs. When x0 is not register 0 the two rotation constants
# are swapped between the pairs.
# NOTE(review): the doubled ampersands look like the altmacro-escaped forms
#   of a bitwise AND and of the name-pasting separator -- confirm
.macro R_256_OneRound _RR_,x0,x1,x2,x3,t0,t1
  .irp _qq_,%((\_RR_) && 7)        #figure out which rotation constants to use
    .if \x0 == 0
_RC0_ =   RC_256_\_qq_&&_0
_RC1_ =   RC_256_\_qq_&&_1
    .else
_RC0_ =   RC_256_\_qq_&&_1
_RC1_ =   RC_256_\_qq_&&_0
    .endif
  .endr
#
    paddq    %xmm\x1,%xmm\x0
    movq     %xmm\x1,%xmm\t0
    psllq  $   _RC0_,%xmm\x1
    psrlq  $64-_RC0_,%xmm\t0
    xorpd    %xmm\x0,%xmm\x1
    xorpd    %xmm\t0,%xmm\x1
#                         
    paddq    %xmm\x3,%xmm\x2
    movq     %xmm\x3,%xmm\t1
    psllq  $   _RC1_,%xmm\x3
    psrlq  $64-_RC1_,%xmm\t1
    xorpd    %xmm\x2,%xmm\x3
    xorpd    %xmm\t1,%xmm\x3
  .if _SKEIN_DEBUG
    Skein_Debug_Round 256,%(\_RR_+1),SAVE_REGS
  .endif
.endm #R_256_OneRound
#
# MACRO: four rounds followed by one key injection.
#   _RN_   number of the first of the four rounds
# Uses xmm0..xmm3 as the state and xmm4/xmm5 as scratch.
# edx counts key injections and is incremented here; its new value is
# the counter added to xmm3.
# Fully unrolled build: key/tweak slots are addressed from ebp with
#   indices computed at assembly time.
# Looping build: slots are addressed from the rolling pointer esi, the
#   first key and tweak entries are copied to the end of their arrays,
#   and esi advances by one 16-byte slot.
.macro R_256_FourRounds _RN_
    R_256_OneRound %(_RN_+0),0,1,2,3,4,5
    R_256_OneRound (_RN_+1),2,1,0,3,4,5

    R_256_OneRound (_RN_+2),0,1,2,3,4,5
    R_256_OneRound (_RN_+3),2,1,0,3,4,5

    #inject key schedule
    incl  %edx                     #bump round number
    movd  %edx,%xmm4
  .if _UNROLL_CNT == (ROUNDS_256/8)
    #fully unrolled version
_RK_ = ((_RN_)/4)                 #key injection counter
    paddq ksKey+16*((_RK_+1) % 5)-F_O(%ebp),%xmm0
    paddq ksKey+16*((_RK_+2) % 5)-F_O(%ebp),%xmm1
    paddq ksKey+16*((_RK_+3) % 5)-F_O(%ebp),%xmm2
    paddq ksKey+16*((_RK_+4) % 5)-F_O(%ebp),%xmm3
    paddq ksTwk+16*((_RK_+1) % 3)-F_O(%ebp),%xmm1
    paddq ksTwk+16*((_RK_+2) % 3)-F_O(%ebp),%xmm2
    paddq %xmm4,%xmm3
  .else #looping version
    paddq ksKey+16*1-F_O(%esi),%xmm0
    paddq ksKey+16*2-F_O(%esi),%xmm1
    paddq ksKey+16*3-F_O(%esi),%xmm2
    paddq ksKey+16*4-F_O(%esi),%xmm3
    paddq ksTwk+16*1-F_O(%esi),%xmm1
    paddq ksTwk+16*2-F_O(%esi),%xmm2
    paddq %xmm4,%xmm3
#   
    # both reads happen before either write: ksTwk+16*3 is the same
    # address as ksKey+0, which is read on the first line below
    movq        ksKey-F_O(%esi),%xmm4   #first, "rotate" key schedule on the stack
    movq        ksTwk-F_O(%esi),%xmm5   #    (for next time through)
    movq  %xmm4,ksKey+16*(WCNT+1)-F_O(%esi)
    movq  %xmm5,ksTwk+16*3-F_O(%esi)
    addl  $16,%esi                     #bump rolling pointer
  .endif
  .if _SKEIN_DEBUG
      Skein_Debug_Round 256,SKEIN_RND_KEY_INJECT,SAVE_REGS
  .endif
.endm #R_256_FourRounds
#
.if _SKEIN_DEBUG # macros for saving/restoring X_stk for debug routines
#
# _Put_XMM_256: store the low 64 bits of xmm0..xmm3 into X_stk[0..3].
# _Get_XMM_256: load X_stk[0..3] back into xmm0..xmm3.
# Both are reached with a call instruction, so the X_stk offsets carry an
# extra 4 bytes for the return address sitting on top of the stack.
#
_Put_XMM_256:
  .irp _NN_,0,1,2,3
    movq  %xmm\_NN_,X_stk+4+\_NN_*8(%esp)
  .endr
    ret
#
_Get_XMM_256:
  .irp _NN_,0,1,2,3
    movq            X_stk+4+\_NN_*8(%esp),%xmm\_NN_
  .endr
    ret
.endif
#
#################
#
# code
#
#
# void Skein_256_Process_Block(ctx, blkPtr, blkCnt, bitcntAdd)
#   The four arguments are read from the stack through the ebx-relative
#   symbols ctxPtr, blkPtr, blkCnt and bitAdd defined by Setup_Stack.
#   All general registers are saved with pushal and restored with popal;
#   xmm0..xmm7 are overwritten.
# Register roles inside the block loop:
#   edi = context, ebx = caller parameters, ecx = blocks remaining,
#   edx = key injection counter, xmm0..xmm3 = state words X[0..3],
#   esi = input pointer, then (looping build only) the rolling
#         key-schedule pointer.
#
C_label Skein_256_Process_Block
    WCNT    =   4                   #WCNT=4 for Skein-256
    Setup_Stack WCNT,ROUNDS_256
    # main hash loop for Skein_256
Skein_256_block_loop:
    movd    bitAdd (%ebx),%xmm4
    movq    TWEAK+0(%edi),%xmm5
    movq    TWEAK+8(%edi),%xmm6
    paddq   %xmm4        ,%xmm5     #bump T0 by the bitAdd parameter
    movq    %xmm5,TWEAK(%edi)       #save updated tweak value T0 (for next time)
    movapd  %xmm6,%xmm7
    xorpd   %xmm5,%xmm7             #compute overall tweak parity
    movdqa  %xmm5,ksTwk   -F_O(%ebp)#save the expanded tweak schedule on the stack
    movdqa  %xmm6,ksTwk+16-F_O(%ebp)        
    movdqa  %xmm7,ksTwk+32-F_O(%ebp)        

    movl    blkPtr(%ebx),%esi       #esi --> input block
    movl    $KW_PARITY_LO,%eax      #init key schedule parity accumulator
    movl    $KW_PARITY_HI,%edx 
    movd    %eax ,%xmm4
    movd    %edx ,%xmm0
    unpcklps %xmm0,%xmm4            #interleave lo/hi dwords into one 64-bit parity value
#
  .irp _NN_,0,1,2,3                 #copy in the chaining vars
    movq    X_VARS+8*\_NN_(%edi),%xmm\_NN_
    xorpd   %xmm\_NN_,%xmm4         #update overall parity
    movdqa  %xmm\_NN_,ksKey+16*\_NN_-F_O(%ebp)
  .endr
    movdqa  %xmm4,ksKey+16*WCNT-F_O(%ebp)#save overall parity at the end of the array
#
    paddq   %xmm5,%xmm1             #inject the initial tweak words
    paddq   %xmm6,%xmm2
#
  .irp _NN_,0,1,2,3                 #perform the initial key injection
    movq          8*\_NN_(%esi),%xmm4#and save a copy of the input block on stack
    movq    %xmm4,8*\_NN_+Wcopy(%esp)
    paddq   %xmm4,%xmm\_NN_         #inject the key word
  .endr
#
.if _SKEIN_DEBUG                    #debug dump of state at this point
    Skein_Debug_Block 256
    Skein_Debug_Round 256,SKEIN_RND_KEY_INITIAL,SAVE_REGS
.endif
    addl    $WCNT*8,%esi            #skip to the next block
    movl    %esi,blkPtr(%ebx)       #save the updated block pointer
    #
    # now the key schedule is computed. Start the rounds
    #
    xorl    %edx,%edx               #edx = iteration count
.if SKEIN_ASM_UNROLL & 256
_UNROLL_CNT =   ROUNDS_256/8        #fully unrolled
.else
_UNROLL_CNT =   SKEIN_UNROLL_256    #partial unroll count
  .if ((ROUNDS_256/8) % _UNROLL_CNT)
    .error "Invalid SKEIN_UNROLL_256" #sanity check
  .endif
    movl    %ebp,%esi               #use this as "rolling" pointer into ksTwk/ksKey
Skein_256_round_loop:               #   (since there's no 16* scaled address mode)
.endif
#
    # each R_256_FourRounds expansion is four rounds plus one key injection,
    # so 2*_UNROLL_CNT expansions cover 8*_UNROLL_CNT rounds
_Rbase_ = 0
.rept _UNROLL_CNT*2                  # here with X[0..3] in XMM0..XMM3
      R_256_FourRounds _Rbase_
_Rbase_ = _Rbase_+4
.endr #rept _UNROLL_CNT*2
#
  .if _UNROLL_CNT <> (ROUNDS_256/8)
    cmpl    $2*(ROUNDS_256/8),%edx  #loop until edx reaches the total injection count
    jb      Skein_256_round_loop
  .endif
    #----------------------------
    # feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..3}
  .irp _NN_,0,1,2,3
    movq    Wcopy+8*\_NN_(%esp),%xmm4
    xorpd   %xmm4,%xmm\_NN_
    movq    %xmm\_NN_,X_VARS+8*\_NN_(%edi)
  .endr
    andb    $FIRST_MASK8,TWEAK +15(%edi)
.if _SKEIN_DEBUG
    Skein_Debug_Round 256,SKEIN_RND_FEED_FWD,SAVE_REGS
.endif
    # go back for more blocks, if needed
    decl    %ecx
    jnz     Skein_256_block_loop
    Reset_Stack _Skein_256_Process_Block
    ret
#
# Optional helpers, assembled only when _SKEIN_CODE_SIZE is defined.
.ifdef _SKEIN_CODE_SIZE
# Returns in eax the distance in bytes from the start of
# Skein_256_Process_Block to this label.
C_label  Skein_256_Process_Block_CodeSize
    movl    $_Skein_256_Process_Block_CodeSize - _Skein_256_Process_Block,%eax
    ret
#
# Returns in eax the unroll count used for Skein-256, or 0 when the
# rounds were generated fully unrolled.
C_label  Skein_256_Unroll_Cnt
  .if _UNROLL_CNT <> ROUNDS_256/8
    movl    $_UNROLL_CNT,%eax
  .else
    xorl    %eax,%eax
  .endif
    ret
.endif
.endif #_USE_ASM_ & 256
#
#----------------------------------------------------------------
#
.if _USE_ASM_ & 512
#
# void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)#
#
#################
# MACRO: one round
#   _RR_                  round number; it selects the RC_512_<r>_* constants
#   a0,a1,Ra .. d0,d1,Rd  four mixes: register numbers of each word pair
#                         and the constant index used for that pair
# Each mix does: x0 += x1, x1 = rotl(x1) ^ x0.
# All eight XMM registers hold state, so scratch space is borrowed:
#   c0 is spilled to X_stk and used as scratch for the first mix, then
#   reloaded before the third mix; a0 is spilled during the second mix,
#   serves as scratch for mixes two to four, and is reloaded at the end.
# NOTE(review): the doubled ampersands look like the altmacro-escaped forms
#   of a bitwise AND and of the name-pasting separator -- confirm
#
.macro R_512_Round _RR_, a0,a1,Ra, b0,b1,Rb, c0,c1,Rc, d0,d1,Rd
  .irp _qq_,%((\_RR_) && 7)
_Ra_ = RC_512_\_qq_&&_\Ra
_Rb_ = RC_512_\_qq_&&_\Rb
_Rc_ = RC_512_\_qq_&&_\Rc
_Rd_ = RC_512_\_qq_&&_\Rd
  .endr
    paddq   %xmm\a1 , %xmm\a0 
                              _stX c0
    movq    %xmm\a1 , %xmm\c0 
    psllq  $   _Ra_ , %xmm\a1 
    psrlq  $64-_Ra_ , %xmm\c0 
    xorpd   %xmm\c0 , %xmm\a1 
    xorpd   %xmm\a0 , %xmm\a1 
                                    
    paddq   %xmm\b1 , %xmm\b0 
                              _stX a0
    movq    %xmm\b1 , %xmm\a0 
    psllq  $   _Rb_ , %xmm\b1 
    psrlq  $64-_Rb_ , %xmm\a0 
    xorpd   %xmm\b0 , %xmm\b1 
                              _ldX c0
    xorpd   %xmm\a0 , %xmm\b1 
                               
    paddq   %xmm\c1 , %xmm\c0 
    movq    %xmm\c1 , %xmm\a0 
    psllq  $   _Rc_ , %xmm\c1 
    psrlq  $64-_Rc_ , %xmm\a0 
    xorpd   %xmm\c0 , %xmm\c1 
    xorpd   %xmm\a0 , %xmm\c1 
                               
    paddq   %xmm\d1 , %xmm\d0 
    movq    %xmm\d1 , %xmm\a0           
    psllq  $   _Rd_ , %xmm\d1 
    psrlq  $64-_Rd_ , %xmm\a0 
    xorpd   %xmm\a0 , %xmm\d1 
                              _ldX a0
    xorpd   %xmm\d0 , %xmm\d1 
  .if _SKEIN_DEBUG
    Skein_Debug_Round 512,%(_RR_+1),SAVE_REGS
  .endif
.endm
#
# MACRO: four rounds
#   _RN_   number of the first of the four rounds (0, 4, 8, ...)
# After the four rounds a key injection adds one key slot to each of
# xmm0..xmm7, two tweak slots to xmm5/xmm6 and the injection counter
# (edx, incremented here) to xmm7. xmm0 is spilled to X_stk while it
# carries the counter and reloaded afterwards.
# In the unrolled build the tweak index is taken from _RN_ itself rather
# than _RN_/4; for multiples of four both give the same value modulo 3.
# In the looping build slots are addressed from the rolling pointer esi,
# which advances by 16 after the first key and tweak entries have been
# copied to the end of their arrays.
.macro R_512_FourRounds _RN_
    R_512_Round %((_RN_)  ), 0,1,0, 2,3,1, 4,5,2, 6,7,3
    R_512_Round %((_RN_)+1), 2,1,0, 4,7,1, 6,5,2, 0,3,3
    R_512_Round %((_RN_)+2), 4,1,0, 6,3,1, 0,5,2, 2,7,3
    R_512_Round %((_RN_)+3), 6,1,0, 0,7,1, 2,5,2, 4,3,3

    #inject key schedule
.irp _NN_,0,1,2,3,4,5,6,7
  .if _UNROLL_CNT == (ROUNDS_512/8)
    paddq ksKey+16*((((\_RN_)/4)+(\_NN_)+1)%9)-F_O(%ebp),%xmm\_NN_
  .else
    paddq ksKey+16*((\_NN_)+1)-F_O(%esi),%xmm\_NN_
  .endif
.endr
    _stX  0                       #free up a register
    incl  %edx                    #bump round counter
    movd  %edx,%xmm0              #inject the tweak
  .if _UNROLL_CNT == (ROUNDS_512/8)
    paddq ksTwk+16*(((_RN_)+1) % 3)-F_O(%ebp),%xmm5
    paddq ksTwk+16*(((_RN_)+2) % 3)-F_O(%ebp),%xmm6
    paddq %xmm0                              ,%xmm7
  .else #looping version
    paddq ksTwk+16*1-F_O(%esi),%xmm5
    paddq ksTwk+16*2-F_O(%esi),%xmm6
    paddq %xmm0               ,%xmm7
    # "rotate" key schedule on the stack (for next time through)
    #   (ksTwk+16*3 is the same address as ksKey+0, so the key entry
    #    must be copied out before the tweak entry is written there)
    movq        ksKey            -F_O(%esi),%xmm0
    movq  %xmm0,ksKey+16*(WCNT+1)-F_O(%esi)
    movq        ksTwk            -F_O(%esi),%xmm0
    movq  %xmm0,ksTwk+16*3       -F_O(%esi)
    addl  $16,%esi                #bump rolling pointer
  .endif
    _ldX  0                       #restore X0
  .if _SKEIN_DEBUG
    Skein_Debug_Round 512,SKEIN_RND_KEY_INJECT,SAVE_REGS
  .endif
.endm #R_512_FourRounds
#################
.if _SKEIN_DEBUG # macros for saving/restoring X_stk for debug routines
# _Put_XMM_512 stores the low 64 bits of xmm0..xmm7 into X_stk[0..7];
# _Get_XMM_512 loads them back. Both are reached with a call instruction,
# so the offsets carry an extra 4 bytes for the return address.
_Put_XMM_512:
  .irp _NN_,0,1,2,3,4,5,6,7
    movq  %xmm\_NN_,X_stk+4+\_NN_*8(%esp)
  .endr
    ret
#
_Get_XMM_512:
  .irp _NN_,0,1,2,3,4,5,6,7
    movq            X_stk+4+\_NN_*8(%esp),%xmm\_NN_
  .endr
    ret
.endif
#
#################
#
#
# void Skein_512_Process_Block(ctx, blkPtr, blkCnt, bitcntAdd)
#   The four arguments are read from the stack through the ebx-relative
#   symbols ctxPtr, blkPtr, blkCnt and bitAdd defined by Setup_Stack.
#   All general registers are saved with pushal and restored with popal;
#   xmm0..xmm7 are overwritten.
# Register roles inside the block loop:
#   edi = context, ebx = caller parameters, ecx = blocks remaining,
#   edx = key injection counter, xmm0..xmm7 = state words X[0..7],
#   esi = input pointer, then (looping build only) the rolling
#         key-schedule pointer.
#
C_label Skein_512_Process_Block
    WCNT    =   8                   #WCNT=8 for Skein-512
    Setup_Stack WCNT,ROUNDS_512
    # main hash loop for Skein_512
Skein_512_block_loop:
    movd    bitAdd(%ebx) ,%xmm0
    movq    TWEAK+0(%edi),%xmm1
    movq    TWEAK+8(%edi),%xmm2
    paddq   %xmm0,%xmm1               #bump T0 by the bitAdd parameter
    movq    %xmm1,TWEAK(%edi)         #save updated tweak value T0 (for next time)
    movq    %xmm2,%xmm0
    xorpd   %xmm1,%xmm0               #compute overall tweak parity
    movdqa  %xmm1,ksTwk     -F_O(%ebp)#save the expanded tweak schedule on the stack
    movdqa  %xmm2,ksTwk+16*1-F_O(%ebp)    
    movdqa  %xmm0,ksTwk+16*2-F_O(%ebp)    

    movl    blkPtr(%ebx),%esi         #esi --> input block
    movl    $KW_PARITY_LO,%eax        #init key schedule parity accumulator
    movl    $KW_PARITY_HI,%edx 
    movd    %eax ,%xmm0
    movd    %edx ,%xmm7
    unpcklps %xmm7,%xmm0              #interleave lo/hi dwords into one 64-bit parity value
#
  .irp _NN_,7,6,5,4,3,2,1             #copy in the chaining vars (skip #0 for now)
    movq    X_VARS+8*\_NN_(%edi),%xmm\_NN_
    xorpd   %xmm\_NN_,%xmm0           #update overall parity
    movdqa  %xmm\_NN_,ksKey+16*\_NN_-F_O(%ebp)
   .if \_NN_ == 5
    paddq   %xmm1,%xmm5               #inject the initial tweak words
    paddq   %xmm2,%xmm6               #  (before they get trashed in %xmm1/2)
   .endif
  .endr
    # xmm4 is reused as scratch from here on; key word #4 stays available
    # in its ksKey slot and is added back after the injection loop
    movq    X_VARS(%edi),%xmm4        #handle #0 now
    xorpd   %xmm4,%xmm0               #update overall parity
    movdqa  %xmm4,ksKey+16* 0  -F_O(%ebp) #save the key value in slot #0
    movdqa  %xmm0,ksKey+16*WCNT-F_O(%ebp) #save overall parity at the end of the array
#
    movq    %xmm4,%xmm0               #X[0] now lives in xmm0 (parity is already stored)
  .irp _NN_,7,6,5,  3,2,1,0           #perform the initial key injection (except #4)
    movq    8*\_NN_(%esi),%xmm4       #and save a copy of the input block on stack
    movq    %xmm4,8*\_NN_+Wcopy(%esp)
    paddq   %xmm4,%xmm\_NN_
  .endr
    movq    8*4(%esi),%xmm4           #get input block word #4
    movq    %xmm4,8*4+Wcopy(%esp)
    paddq   ksKey+16*4-F_O(%ebp),%xmm4#inject the initial key
#
.if _SKEIN_DEBUG                      #debug dump of state at this point
    Skein_Debug_Block 512
    Skein_Debug_Round 512,SKEIN_RND_KEY_INITIAL,SAVE_REGS
.endif
    addl    $WCNT*8,%esi              #skip to the next block
    movl    %esi,blkPtr(%ebx)         #save the updated block pointer
    #
    # now the key schedule is computed. Start the rounds
    #
    xorl    %edx,%edx                 #edx = round counter
.if SKEIN_ASM_UNROLL & 512
_UNROLL_CNT =   ROUNDS_512/8
.else
_UNROLL_CNT =   SKEIN_UNROLL_512
  .if ((ROUNDS_512/8) % _UNROLL_CNT)
    .error "Invalid SKEIN_UNROLL_512"
  .endif
    movl    %ebp,%esi                 #use this as "rolling" pointer into ksTwk/ksKey
Skein_512_round_loop:                 #   (since there's no 16* scaled address mode)
.endif
    # each R_512_FourRounds expansion is four rounds plus one key injection,
    # so 2*_UNROLL_CNT expansions cover 8*_UNROLL_CNT rounds
_Rbase_ = 0
.rept _UNROLL_CNT*2
      R_512_FourRounds %_Rbase_
_Rbase_ = _Rbase_+4
.endr #rept _UNROLL_CNT*2
#
.if (SKEIN_ASM_UNROLL & 512) == 0
    cmpl    $2*(ROUNDS_512/8),%edx    #loop until edx reaches the total injection count
    jb      Skein_512_round_loop
.endif
    #----------------------------
    # feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..7}
    andb    $FIRST_MASK8,TWEAK +15(%edi)
.irp _NN_,0,2,4,6                   #do the aligned ones first
    xorpd   Wcopy+8*\_NN_(%esp),%xmm\_NN_
    movq    %xmm\_NN_,X_VARS+8*\_NN_(%edi)
.endr
.irp _NN_,1,3,5,7                   #now we have some register space available
    movq    Wcopy+8*\_NN_(%esp),%xmm0
    xorpd   %xmm0,%xmm&\_NN_
    movq    %xmm&\_NN_,X_VARS+8*\_NN_(%edi)
.endr
.if _SKEIN_DEBUG
    Skein_Debug_Round 512,SKEIN_RND_FEED_FWD
.endif
    # go back for more blocks, if needed
    decl    %ecx
    jnz     Skein_512_block_loop

    Reset_Stack _Skein_512_Process_Block
    ret
#
# Optional helpers, assembled only when _SKEIN_CODE_SIZE is defined.
.ifdef _SKEIN_CODE_SIZE
# Returns in eax the distance in bytes from the start of
# Skein_512_Process_Block to this label.
C_label Skein_512_Process_Block_CodeSize
    movl    $(_Skein_512_Process_Block_CodeSize - _Skein_512_Process_Block),%eax
    ret
#
# Returns in eax the unroll count used for Skein-512, or 0 when the
# rounds were generated fully unrolled.
C_label Skein_512_Unroll_Cnt
  .if _UNROLL_CNT <> ROUNDS_512/8
    movl    $_UNROLL_CNT,%eax
  .else
    xorl    %eax,%eax
  .endif
    ret
.endif
#
.endif # _USE_ASM_ & 512
#
#----------------------------------------------------------------
#
.if _USE_ASM_ & 1024
    .global      _Skein1024_Process_Block
#
# void Skein_1024_Process_Block(Skein_1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)#
#
# Words X[0..R_1024_REGS-1] stay in xmm0..xmm4; the remaining words live
# in X_stk[] and pass through xmm5/xmm6 when they are mixed.
R_1024_REGS =     (5)     #keep this many block variables in registers
#
################
.if _SKEIN_DEBUG # macros for saving/restoring X_stk for debug routines
# _Put_XMM_1024 stores the register-resident words into X_stk[];
# _Get_XMM_1024 loads them back. Both are reached with a call
# instruction, so the offsets carry an extra 4 bytes for the return address.
# Here _NN_ is an ordinary assembler symbol stepped by the .rept loop.
_Put_XMM_1024:
_NN_ = 0
  .rept R_1024_REGS
   .irp _rr_,%(_NN_)
    movq   %xmm\_rr_,X_stk+4+8*_NN_(%esp)
   .endr
_NN_ = _NN_+1
  .endr
    ret
#
_Get_XMM_1024:
_NN_ = 0
  .rept R_1024_REGS
   .irp _rr_,%(_NN_)
    movq             X_stk+4+8*_NN_(%esp),%xmm\_rr_
   .endr
_NN_ = _NN_+1
  .endr
    ret
.endif
#
#################
# MACRO: one mix step
#   x0,x1     word numbers (0..15) of the pair: X[x0] += X[x1],
#             X[x1] = rotl(X[x1]) ^ X[x0]
#   rotIdx0   round number; it selects the RC_1024_<r>_* constant row
#   rotIdx1   index of the constant within that row
#   _debug_   nonzero: emit a Skein_Debug_Round dump after the mix
#             (only in debug builds), reported as round rotIdx0+1
# Words numbered below R_1024_REGS are used directly in xmm<word number>.
# Other words are loaded from X_stk[] into xmm5 (x0) or xmm6 (x1) and
# written back after the mix. xmm7 is scratch for the rotate.
# NOTE(review): the doubled ampersands look like the altmacro-escaped forms
#   of a bitwise AND and of the name-pasting separator -- confirm
.macro MixStep_1024  x0,x1,rotIdx0,rotIdx1,_debug_=0
_r0_ =  \x0      #default, if already loaded
_r1_ =  \x1
  # load the regs (if necessary)
  .if (\x0 >= R_1024_REGS)
_r0_ =       5
    movq    X_stk+8*(\x0)(%esp),%xmm5
  .endif
  .if (\x1 >= R_1024_REGS)
_r1_ =       6     
    movq  X_stk+8*(\x1)(%esp),%xmm6
  .endif
  # do the mix
  .irp _rx_,%((rotIdx0) && 7)
_Rc_ = RC_1024_\_rx_&&_\rotIdx1  #rotation constant
  .endr
  .irp _x0_,%_r0_
  .irp _x1_,%_r1_
    paddq   %xmm\_x1_,%xmm\_x0_
    movq    %xmm\_x1_,%xmm7    
    psllq  $   _Rc_  ,%xmm\_x1_
    psrlq  $64-_Rc_  ,%xmm7    
    xorpd   %xmm\_x0_,%xmm\_x1_
    xorpd   %xmm7    ,%xmm\_x1_
  .endr
  .endr
  # save the regs (if necessary)
  .if (\x0 >= R_1024_REGS)
    movq    %xmm5,X_stk+8*(\x0)(%esp)
  .endif
  .if (\x1 >= R_1024_REGS)
    movq    %xmm6,X_stk+8*(\x1)(%esp)
  .endif
  # debug output
  #   (the argument name is spelled exactly as in the macro header so that
  #    it is substituted regardless of how the assembler treats letter case)
  .if _SKEIN_DEBUG && (\_debug_)
    Skein_Debug_Round 1024,%((\rotIdx0)+1),SAVE_REGS
  .endif
.endm
#################
# MACRO: four rounds
#
#   _RR_   number of the first of the four rounds
# Each round is eight MixStep_1024 expansions; the last one of every round
# passes 1 as the debug flag so a debug build dumps the state once per round.
# The key injection that follows adds one key slot to every word, two
# tweak slots to words 13 and 14, and the injection counter (edx,
# incremented here) to word 15. Words 5..15 are updated in X_stk[]
# through xmm6; xmm7 carries the counter.
# The unrolled branch computes its slot indices from the symbol _Rbase_
# (set by the invoking .rept loop) rather than from the macro argument.
.macro R_1024_FourRounds _RR_
    #--------- round _RR_
    MixStep_1024     0, 1,%((\_RR_)+0),0
    MixStep_1024     2, 3,%((\_RR_)+0),1
    MixStep_1024     4, 5,%((\_RR_)+0),2
    MixStep_1024     6, 7,%((\_RR_)+0),3
    MixStep_1024     8, 9,%((\_RR_)+0),4
    MixStep_1024    10,11,%((\_RR_)+0),5
    MixStep_1024    12,13,%((\_RR_)+0),6
    MixStep_1024    14,15,%((\_RR_)+0),7,1
    #--------- round _RR_+1
    MixStep_1024     0, 9,%((\_RR_)+1),0
    MixStep_1024     2,13,%((\_RR_)+1),1
    MixStep_1024     6,11,%((\_RR_)+1),2
    MixStep_1024     4,15,%((\_RR_)+1),3
    MixStep_1024    10, 7,%((\_RR_)+1),4
    MixStep_1024    12, 3,%((\_RR_)+1),5
    MixStep_1024    14, 5,%((\_RR_)+1),6
    MixStep_1024     8, 1,%((\_RR_)+1),7,1
    #--------- round _RR_+2
    MixStep_1024     0, 7,%((\_RR_)+2),0    
    MixStep_1024     2, 5,%((\_RR_)+2),1
    MixStep_1024     4, 3,%((\_RR_)+2),2    
    MixStep_1024     6, 1,%((\_RR_)+2),3    
    MixStep_1024    12,15,%((\_RR_)+2),4
    MixStep_1024    14,13,%((\_RR_)+2),5    
    MixStep_1024     8,11,%((\_RR_)+2),6    
    MixStep_1024    10, 9,%((\_RR_)+2),7,1
    #--------- round _RR_+3
    MixStep_1024     0,15,%((\_RR_)+3),0
    MixStep_1024     2,11,%((\_RR_)+3),1
    MixStep_1024     6,13,%((\_RR_)+3),2
    MixStep_1024     4, 9,%((\_RR_)+3),3
    MixStep_1024    14, 1,%((\_RR_)+3),4
    MixStep_1024     8, 5,%((\_RR_)+3),5
    MixStep_1024    10, 3,%((\_RR_)+3),6
    MixStep_1024    12, 7,%((\_RR_)+3),7,1

    incl  %edx                     #edx = round number
    movd  %edx,%xmm7

    #inject the key
.irp _NN_,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
  .if _UNROLL_CNT <> (ROUNDS_1024/8)
    # looping version: slots are addressed from the rolling pointer esi
    .if \_NN_ < R_1024_REGS
      paddq ksKey+16*\_NN_+16-F_O(%esi),%xmm&\_NN_
    .else
      movq  X_stk+ 8*\_NN_(%esp),%xmm6
     .if     \_NN_ == 15
      paddq %xmm7,%xmm6
     .elseif \_NN_ == 14
      paddq ksTwk+16*2-F_O(%esi),%xmm6
     .elseif \_NN_ == 13
      paddq ksTwk+16*1-F_O(%esi),%xmm6
     .endif
      paddq       ksKey+16*\_NN_+16-F_O(%esi),%xmm6
      movq  %xmm6,X_stk+ 8*\_NN_(%esp)
    .endif
  .else
    # fully unrolled version: slot indices are computed at assembly time
    .if \_NN_ < R_1024_REGS
      paddq ksKey+16*(((_Rbase_/4)+(\_NN_)+1) % 17)-F_O(%ebp),%xmm&\_NN_
    .else
      movq  X_stk+ 8*\_NN_(%esp), %xmm6
      paddq ksKey+16*(((_Rbase_/4)+(\_NN_)+1) % 17)-F_O(%ebp),%xmm6
     .if     \_NN_ == 15
      paddq %xmm7,%xmm6
     .elseif \_NN_ == 14
      paddq ksTwk+16*(((_Rbase_/4)+2) %  3)-F_O(%ebp),%xmm6
     .elseif \_NN_ == 13
      paddq ksTwk+16*(((_Rbase_/4)+1) %  3)-F_O(%ebp),%xmm6
     .endif
      movq %xmm6,X_stk+ 8*\_NN_(%esp)
    .endif
  .endif
.endr
  .if _UNROLL_CNT <> (ROUNDS_1024/8) #rotate the key schedule on the stack
    # both reads happen before either write: ksTwk+16*3 is the same
    # address as ksKey+0, which is read on the first line below
    movq ksKey-F_O(%esi), %xmm6
    movq ksTwk-F_O(%esi), %xmm7
    movq %xmm6,ksKey+16*(WCNT+1)-F_O(%esi)
    movq %xmm7,ksTwk+16* 3      -F_O(%esi)
    addl $16,%esi                   #bump rolling pointer
  .endif
  .if _SKEIN_DEBUG
      Skein_Debug_Round 1024,SKEIN_RND_KEY_INJECT ,SAVE_REGS
  .endif
.endm #R_1024_FourRounds
#
################
#
#
# void Skein1024_Process_Block(ctx, blkPtr, blkCnt, bitcntAdd)
#   The four arguments are read from the stack through the ebx-relative
#   symbols ctxPtr, blkPtr, blkCnt and bitAdd defined by Setup_Stack.
#   All general registers are saved with pushal and restored with popal;
#   xmm0..xmm7 are overwritten.
# Register roles inside the block loop:
#   edi = context + 0x80 (every context offset below subtracts 0x80),
#   ebx = caller parameters, ecx = blocks remaining,
#   edx = key injection counter, eax = esp + 0x80 while copying,
#   xmm0..xmm4 = state words X[0..4]; X[5..15] live in X_stk[],
#   esi = input pointer, then (looping build only) the rolling
#         key-schedule pointer.
#
C_label Skein1024_Process_Block
#
    WCNT    =   16                  #WCNT=16 for Skein-1024
    Setup_Stack WCNT,ROUNDS_1024
    addl    $0x80,%edi              #bias the edi ctxt offsets to keep them all short
    # main hash loop for Skein1024
Skein1024_block_loop:
    movd    bitAdd(%ebx)      ,%xmm0
    movq    TWEAK+0-0x80(%edi),%xmm1
    movq    TWEAK+8-0x80(%edi),%xmm2
    paddq   %xmm0,%xmm1             #bump T0 by the bitAdd parameter
    movq    %xmm1,TWEAK-0x80(%edi)  #save updated tweak value T0 (for next time)
    movq    %xmm2,%xmm0
    xorpd   %xmm1,%xmm0             #compute overall tweak parity
    movdqa  %xmm1,ksTwk   -F_O(%ebp)#save the expanded tweak schedule on the stack
    movdqa  %xmm2,ksTwk+16-F_O(%ebp)
    movdqa  %xmm0,ksTwk+32-F_O(%ebp)

    movl    blkPtr(%ebx),%esi       #esi --> input block
    movl    $KW_PARITY_LO,%eax      #init key schedule parity accumulator
    movl    $KW_PARITY_HI,%edx 
    movd    %eax ,%xmm7
    movd    %edx ,%xmm6
    unpcklps %xmm6,%xmm7            #replicate parity dword to 64 bits
#
    leal    0x80(%esp),%eax         #use short offsets for Wcopy, X_stk writes below
    # For each word, from 15 down to 0: load the chaining word into xmm6,
    # fold it into the parity in xmm7 and store it in its ksKey slot; then
    # load the input word into the target register (xmm<word> for words
    # 0..4, xmm5 otherwise), keep a copy in Wcopy[] and add the key word.
    # Words 13 and 14 also receive the tweak words still held in xmm1/xmm2,
    # which are not overwritten until words 2 and 1 are processed.
.irp _NN_,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
    movq    X_VARS+8*\_NN_-0x80(%edi),%xmm6
    xorpd   %xmm6,%xmm7             #update overall parity
    movdqa  %xmm6,ksKey+16*\_NN_-F_O(%ebp) #save the key schedule on the stack
  .if \_NN_ < R_1024_REGS
    _rr_  =  \_NN_
  .else
    _rr_  =   R_1024_REGS
  .endif
  .irp _rn_,%(_rr_)
    movq    8*\_NN_(%esi),%xmm\_rn_ #save copy of the input block on stack
    movq    %xmm\_rn_,Wcopy+8*\_NN_-0x80(%eax)  #(for feedforward later)
    paddq   %xmm6,%xmm\_rn_         #inject the key into the block
   .if \_NN_ == 13
    paddq   %xmm1,%xmm\_rn_         #inject the initial tweak words
   .elseif \_NN_ == 14
    paddq   %xmm2,%xmm\_rn_
   .endif
   .if \_NN_ >= R_1024_REGS         #only save X[5..15] on stack, leave X[0..4] in regs
    movq    %xmm\_rn_,X_stk+8*\_NN_-0x80(%eax)
   .endif
  .endr
.endr
    movdqa  %xmm7,ksKey+16*WCNT-F_O(%ebp) #save overall key parity at the end of the array
#
.if _SKEIN_DEBUG                    #debug dump of state at this point
    Skein_Debug_Block 1024
    Skein_Debug_Round 1024,SKEIN_RND_KEY_INITIAL,SAVE_REGS
.endif
    addl    $WCNT*8,%esi            #skip to the next block
    movl    %esi,blkPtr(%ebx)       #save the updated block pointer
    #
    # now the key schedule is computed. Start the rounds
    #
    xorl    %edx,%edx               #edx = round counter
.if SKEIN_ASM_UNROLL & 1024
_UNROLL_CNT =   ROUNDS_1024/8
.else
_UNROLL_CNT =   SKEIN_UNROLL_1024
  .if ((ROUNDS_1024/8) % _UNROLL_CNT)
    .error "Invalid SKEIN_UNROLL_1024"
  .endif
    movl    %ebp,%esi               #use this as "rolling" pointer into ksTwk/ksKey
Skein_1024_round_loop:
.endif
#
    # each R_1024_FourRounds expansion is four rounds plus one key injection,
    # so 2*_UNROLL_CNT expansions cover 8*_UNROLL_CNT rounds
_Rbase_ = 0
.rept _UNROLL_CNT*2
    R_1024_FourRounds %_Rbase_
_Rbase_ = _Rbase_+4
.endr #rept _UNROLL_CNT*2
#
.if (SKEIN_ASM_UNROLL & 1024) == 0
    cmp     $2*(ROUNDS_1024/8),%edx #loop until edx reaches the total injection count
    jb      Skein_1024_round_loop
.endif
    andb    $FIRST_MASK8,TWEAK +15-0x80(%edi)      #clear tweak bit for next time thru
    #----------------------------
    # feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..15}
    #   Even-numbered words XOR straight from Wcopy[]; odd-numbered words
    #   are loaded with movq first (the "unaligned" case).
    # NOTE(review): the doubled ampersand in the odd/even tests looks like the
    #   altmacro-escaped bitwise AND with 1 -- confirm
    leal    0x80(%esp),%eax                        #allow short offsets to X_stk and Wcopy
.irp _NN_,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
  .if \_NN_ < R_1024_REGS
    .if \_NN_ && 1                                 #already in regs: no load needed
      movq  Wcopy+ 8*\_NN_-0x80(%eax),%xmm7        #unaligned
      xorpd %xmm7,%xmm\_NN_
    .else
      xorpd Wcopy+ 8*\_NN_-0x80(%eax),%xmm\_NN_    #aligned
    .endif
      movq  %xmm\_NN_,X_VARS+8*\_NN_-0x80(%edi)
  .else
      movq    X_stk+8*\_NN_-0x80(%eax),%xmm7       #load X value from stack
    .if \_NN_ && 1
      movq    Wcopy+8*\_NN_-0x80(%eax),%xmm6       #unaligned
      xorpd   %xmm6,%xmm7
    .else
      xorpd   Wcopy+8*\_NN_-0x80(%eax),%xmm7       #aligned
    .endif
      movq    %xmm7,X_VARS+8*\_NN_-0x80(%edi)
 .endif
.endr
.if _SKEIN_DEBUG
    Skein_Debug_Round 1024,SKEIN_RND_FEED_FWD   #no need to save regs on stack here
.endif
    # go back for more blocks, if needed
    decl    %ecx
    jnz     Skein1024_block_loop

    Reset_Stack _Skein1024_Process_Block
    ret
#
# Optional helpers, assembled only when _SKEIN_CODE_SIZE is defined.
.ifdef _SKEIN_CODE_SIZE
# Returns in eax the distance in bytes from the start of
# Skein1024_Process_Block to this label.
C_label Skein1024_Process_Block_CodeSize
    movl    $(_Skein1024_Process_Block_CodeSize - _Skein1024_Process_Block),%eax
    ret
#
# Returns in eax the unroll count used for Skein-1024, or 0 when the
# rounds were generated fully unrolled.
C_label Skein1024_Unroll_Cnt
  .if _UNROLL_CNT <> ROUNDS_1024/8
    movl    $_UNROLL_CNT,%eax
  .else
    xorl    %eax,%eax
  .endif
    ret
.endif
#
.endif # _USE_ASM_ & 1024
#----------------------------------------------------------------
    .end