1238384Sjkim#!/usr/bin/env perl 2238384Sjkim 3238384Sjkim# ==================================================================== 4238384Sjkim# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL 5238384Sjkim# project. The module is, however, dual licensed under OpenSSL and 6238384Sjkim# CRYPTOGAMS licenses depending on where you obtain it. For further 7238384Sjkim# details see http://www.openssl.org/~appro/cryptogams/. 8238384Sjkim# ==================================================================== 9238384Sjkim 10238384Sjkim# I let hardware handle unaligned input(*), except on page boundaries 11238384Sjkim# (see below for details). Otherwise straightforward implementation 12238384Sjkim# with X vector in register bank. The module is big-endian [which is 13238384Sjkim# not big deal as there're no little-endian targets left around]. 14238384Sjkim# 15238384Sjkim# (*) this means that this module is inappropriate for PPC403? Does 16238384Sjkim# anybody know if pre-POWER3 can sustain unaligned load? 17238384Sjkim 18238384Sjkim# -m64 -m32 19238384Sjkim# ---------------------------------- 20238384Sjkim# PPC970,gcc-4.0.0 +76% +59% 21238384Sjkim# Power6,xlc-7 +68% +33% 22238384Sjkim 23238384Sjkim$flavour = shift; 24238384Sjkim 25238384Sjkimif ($flavour =~ /64/) { 26238384Sjkim $SIZE_T =8; 27238384Sjkim $LRSAVE =2*$SIZE_T; 28238384Sjkim $UCMP ="cmpld"; 29238384Sjkim $STU ="stdu"; 30238384Sjkim $POP ="ld"; 31238384Sjkim $PUSH ="std"; 32238384Sjkim} elsif ($flavour =~ /32/) { 33238384Sjkim $SIZE_T =4; 34238384Sjkim $LRSAVE =$SIZE_T; 35238384Sjkim $UCMP ="cmplw"; 36238384Sjkim $STU ="stwu"; 37238384Sjkim $POP ="lwz"; 38238384Sjkim $PUSH ="stw"; 39238384Sjkim} else { die "nonsense $flavour"; } 40238384Sjkim 41238384Sjkim$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 42238384Sjkim( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or 43238384Sjkim( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or 44238384Sjkimdie "can't locate ppc-xlate.pl"; 45238384Sjkim 46238384Sjkimopen STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; 47238384Sjkim 48238384Sjkim$FRAME=24*$SIZE_T+64; 49238384Sjkim$LOCALS=6*$SIZE_T; 50238384Sjkim 51238384Sjkim$K ="r0"; 52238384Sjkim$sp ="r1"; 53238384Sjkim$toc="r2"; 54238384Sjkim$ctx="r3"; 55238384Sjkim$inp="r4"; 56238384Sjkim$num="r5"; 57238384Sjkim$t0 ="r15"; 58238384Sjkim$t1 ="r6"; 59238384Sjkim 60238384Sjkim$A ="r7"; 61238384Sjkim$B ="r8"; 62238384Sjkim$C ="r9"; 63238384Sjkim$D ="r10"; 64238384Sjkim$E ="r11"; 65238384Sjkim$T ="r12"; 66238384Sjkim 67238384Sjkim@V=($A,$B,$C,$D,$E,$T); 68238384Sjkim@X=("r16","r17","r18","r19","r20","r21","r22","r23", 69238384Sjkim "r24","r25","r26","r27","r28","r29","r30","r31"); 70238384Sjkim 71238384Sjkimsub BODY_00_19 { 72238384Sjkimmy ($i,$a,$b,$c,$d,$e,$f)=@_; 73238384Sjkimmy $j=$i+1; 74238384Sjkim$code.=<<___ if ($i==0); 75238384Sjkim lwz @X[$i],`$i*4`($inp) 76238384Sjkim___ 77238384Sjkim$code.=<<___ if ($i<15); 78238384Sjkim lwz @X[$j],`$j*4`($inp) 79238384Sjkim add $f,$K,$e 80238384Sjkim rotlwi $e,$a,5 81238384Sjkim add $f,$f,@X[$i] 82238384Sjkim and $t0,$c,$b 83238384Sjkim add $f,$f,$e 84238384Sjkim andc $t1,$d,$b 85238384Sjkim rotlwi $b,$b,30 86238384Sjkim or $t0,$t0,$t1 87238384Sjkim add $f,$f,$t0 88238384Sjkim___ 89238384Sjkim$code.=<<___ if ($i>=15); 90238384Sjkim add $f,$K,$e 91238384Sjkim rotlwi $e,$a,5 92238384Sjkim xor @X[$j%16],@X[$j%16],@X[($j+2)%16] 93238384Sjkim add $f,$f,@X[$i%16] 94238384Sjkim and $t0,$c,$b 95238384Sjkim xor @X[$j%16],@X[$j%16],@X[($j+8)%16] 96238384Sjkim add $f,$f,$e 97238384Sjkim andc $t1,$d,$b 98238384Sjkim rotlwi $b,$b,30 99238384Sjkim or $t0,$t0,$t1 100238384Sjkim xor @X[$j%16],@X[$j%16],@X[($j+13)%16] 101238384Sjkim add $f,$f,$t0 102238384Sjkim rotlwi @X[$j%16],@X[$j%16],1 103238384Sjkim___ 104238384Sjkim} 105238384Sjkim 106238384Sjkimsub BODY_20_39 { 107238384Sjkimmy ($i,$a,$b,$c,$d,$e,$f)=@_; 108238384Sjkimmy $j=$i+1; 109238384Sjkim$code.=<<___ if ($i<79); 110238384Sjkim add $f,$K,$e 111238384Sjkim rotlwi $e,$a,5 112238384Sjkim xor @X[$j%16],@X[$j%16],@X[($j+2)%16] 113238384Sjkim add $f,$f,@X[$i%16] 114238384Sjkim xor $t0,$b,$c 115238384Sjkim xor @X[$j%16],@X[$j%16],@X[($j+8)%16] 116238384Sjkim add $f,$f,$e 117238384Sjkim rotlwi $b,$b,30 118238384Sjkim xor $t0,$t0,$d 119238384Sjkim xor @X[$j%16],@X[$j%16],@X[($j+13)%16] 120238384Sjkim add $f,$f,$t0 121238384Sjkim rotlwi @X[$j%16],@X[$j%16],1 122238384Sjkim___ 123238384Sjkim$code.=<<___ if ($i==79); 124238384Sjkim add $f,$K,$e 125238384Sjkim rotlwi $e,$a,5 126238384Sjkim lwz r16,0($ctx) 127238384Sjkim add $f,$f,@X[$i%16] 128238384Sjkim xor $t0,$b,$c 129238384Sjkim lwz r17,4($ctx) 130238384Sjkim add $f,$f,$e 131238384Sjkim rotlwi $b,$b,30 132238384Sjkim lwz r18,8($ctx) 133238384Sjkim xor $t0,$t0,$d 134238384Sjkim lwz r19,12($ctx) 135238384Sjkim add $f,$f,$t0 136238384Sjkim lwz r20,16($ctx) 137238384Sjkim___ 138238384Sjkim} 139238384Sjkim 140238384Sjkimsub BODY_40_59 { 141238384Sjkimmy ($i,$a,$b,$c,$d,$e,$f)=@_; 142238384Sjkimmy $j=$i+1; 143238384Sjkim$code.=<<___; 144238384Sjkim add $f,$K,$e 145238384Sjkim rotlwi $e,$a,5 146238384Sjkim xor @X[$j%16],@X[$j%16],@X[($j+2)%16] 147238384Sjkim add $f,$f,@X[$i%16] 148238384Sjkim and $t0,$b,$c 149238384Sjkim xor @X[$j%16],@X[$j%16],@X[($j+8)%16] 150238384Sjkim add $f,$f,$e 151238384Sjkim or $t1,$b,$c 152238384Sjkim rotlwi $b,$b,30 153238384Sjkim xor @X[$j%16],@X[$j%16],@X[($j+13)%16] 154238384Sjkim and $t1,$t1,$d 155238384Sjkim or $t0,$t0,$t1 156238384Sjkim rotlwi @X[$j%16],@X[$j%16],1 157238384Sjkim add $f,$f,$t0 158238384Sjkim___ 159238384Sjkim} 160238384Sjkim 161238384Sjkim$code=<<___; 162238384Sjkim.machine "any" 163238384Sjkim.text 164238384Sjkim 165238384Sjkim.globl .sha1_block_data_order 166238384Sjkim.align 4 167238384Sjkim.sha1_block_data_order: 168238384Sjkim $STU $sp,-$FRAME($sp) 169238384Sjkim mflr r0 170238384Sjkim $PUSH r15,`$FRAME-$SIZE_T*17`($sp) 171238384Sjkim $PUSH r16,`$FRAME-$SIZE_T*16`($sp) 172238384Sjkim $PUSH r17,`$FRAME-$SIZE_T*15`($sp) 173238384Sjkim $PUSH r18,`$FRAME-$SIZE_T*14`($sp) 174238384Sjkim $PUSH r19,`$FRAME-$SIZE_T*13`($sp) 175238384Sjkim $PUSH r20,`$FRAME-$SIZE_T*12`($sp) 176238384Sjkim $PUSH r21,`$FRAME-$SIZE_T*11`($sp) 177238384Sjkim $PUSH r22,`$FRAME-$SIZE_T*10`($sp) 178238384Sjkim $PUSH r23,`$FRAME-$SIZE_T*9`($sp) 179238384Sjkim $PUSH r24,`$FRAME-$SIZE_T*8`($sp) 180238384Sjkim $PUSH r25,`$FRAME-$SIZE_T*7`($sp) 181238384Sjkim $PUSH r26,`$FRAME-$SIZE_T*6`($sp) 182238384Sjkim $PUSH r27,`$FRAME-$SIZE_T*5`($sp) 183238384Sjkim $PUSH r28,`$FRAME-$SIZE_T*4`($sp) 184238384Sjkim $PUSH r29,`$FRAME-$SIZE_T*3`($sp) 185238384Sjkim $PUSH r30,`$FRAME-$SIZE_T*2`($sp) 186238384Sjkim $PUSH r31,`$FRAME-$SIZE_T*1`($sp) 187238384Sjkim $PUSH r0,`$FRAME+$LRSAVE`($sp) 188238384Sjkim lwz $A,0($ctx) 189238384Sjkim lwz $B,4($ctx) 190238384Sjkim lwz $C,8($ctx) 191238384Sjkim lwz $D,12($ctx) 192238384Sjkim lwz $E,16($ctx) 193238384Sjkim andi. r0,$inp,3 194238384Sjkim bne Lunaligned 195238384SjkimLaligned: 196238384Sjkim mtctr $num 197238384Sjkim bl Lsha1_block_private 198238384Sjkim b Ldone 199238384Sjkim 200238384Sjkim; PowerPC specification allows an implementation to be ill-behaved 201238384Sjkim; upon unaligned access which crosses page boundary. "Better safe 202238384Sjkim; than sorry" principle makes me treat it specially. But I don't 203238384Sjkim; look for particular offending word, but rather for 64-byte input 204238384Sjkim; block which crosses the boundary. Once found that block is aligned 205238384Sjkim; and hashed separately... 206238384Sjkim.align 4 207238384SjkimLunaligned: 208238384Sjkim subfic $t1,$inp,4096 209238384Sjkim andi. $t1,$t1,4095 ; distance to closest page boundary 210238384Sjkim srwi. $t1,$t1,6 ; t1/=64 211238384Sjkim beq Lcross_page 212238384Sjkim $UCMP $num,$t1 213238384Sjkim ble- Laligned ; didn't cross the page boundary 214238384Sjkim mtctr $t1 215238384Sjkim subfc $num,$t1,$num 216238384Sjkim bl Lsha1_block_private 217238384SjkimLcross_page: 218238384Sjkim li $t1,16 219238384Sjkim mtctr $t1 220238384Sjkim addi r20,$sp,$LOCALS ; spot within the frame 221238384SjkimLmemcpy: 222238384Sjkim lbz r16,0($inp) 223238384Sjkim lbz r17,1($inp) 224238384Sjkim lbz r18,2($inp) 225238384Sjkim lbz r19,3($inp) 226238384Sjkim addi $inp,$inp,4 227238384Sjkim stb r16,0(r20) 228238384Sjkim stb r17,1(r20) 229238384Sjkim stb r18,2(r20) 230238384Sjkim stb r19,3(r20) 231238384Sjkim addi r20,r20,4 232238384Sjkim bdnz Lmemcpy 233238384Sjkim 234238384Sjkim $PUSH $inp,`$FRAME-$SIZE_T*18`($sp) 235238384Sjkim li $t1,1 236238384Sjkim addi $inp,$sp,$LOCALS 237238384Sjkim mtctr $t1 238238384Sjkim bl Lsha1_block_private 239238384Sjkim $POP $inp,`$FRAME-$SIZE_T*18`($sp) 240238384Sjkim addic. $num,$num,-1 241238384Sjkim bne- Lunaligned 242238384Sjkim 243238384SjkimLdone: 244238384Sjkim $POP r0,`$FRAME+$LRSAVE`($sp) 245238384Sjkim $POP r15,`$FRAME-$SIZE_T*17`($sp) 246238384Sjkim $POP r16,`$FRAME-$SIZE_T*16`($sp) 247238384Sjkim $POP r17,`$FRAME-$SIZE_T*15`($sp) 248238384Sjkim $POP r18,`$FRAME-$SIZE_T*14`($sp) 249238384Sjkim $POP r19,`$FRAME-$SIZE_T*13`($sp) 250238384Sjkim $POP r20,`$FRAME-$SIZE_T*12`($sp) 251238384Sjkim $POP r21,`$FRAME-$SIZE_T*11`($sp) 252238384Sjkim $POP r22,`$FRAME-$SIZE_T*10`($sp) 253238384Sjkim $POP r23,`$FRAME-$SIZE_T*9`($sp) 254238384Sjkim $POP r24,`$FRAME-$SIZE_T*8`($sp) 255238384Sjkim $POP r25,`$FRAME-$SIZE_T*7`($sp) 256238384Sjkim $POP r26,`$FRAME-$SIZE_T*6`($sp) 257238384Sjkim $POP r27,`$FRAME-$SIZE_T*5`($sp) 258238384Sjkim $POP r28,`$FRAME-$SIZE_T*4`($sp) 259238384Sjkim $POP r29,`$FRAME-$SIZE_T*3`($sp) 260238384Sjkim $POP r30,`$FRAME-$SIZE_T*2`($sp) 261238384Sjkim $POP r31,`$FRAME-$SIZE_T*1`($sp) 262238384Sjkim mtlr r0 263238384Sjkim addi $sp,$sp,$FRAME 264238384Sjkim blr 265238384Sjkim .long 0 266238384Sjkim .byte 0,12,4,1,0x80,18,3,0 267238384Sjkim .long 0 268238384Sjkim___ 269238384Sjkim 270238384Sjkim# This is private block function, which uses tailored calling 271238384Sjkim# interface, namely upon entry SHA_CTX is pre-loaded to given 272238384Sjkim# registers and counter register contains amount of chunks to 273238384Sjkim# digest... 274238384Sjkim$code.=<<___; 275238384Sjkim.align 4 276238384SjkimLsha1_block_private: 277238384Sjkim___ 278238384Sjkim$code.=<<___; # load K_00_19 279238384Sjkim lis $K,0x5a82 280238384Sjkim ori $K,$K,0x7999 281238384Sjkim___ 282238384Sjkimfor($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } 283238384Sjkim$code.=<<___; # load K_20_39 284238384Sjkim lis $K,0x6ed9 285238384Sjkim ori $K,$K,0xeba1 286238384Sjkim___ 287238384Sjkimfor(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } 288238384Sjkim$code.=<<___; # load K_40_59 289238384Sjkim lis $K,0x8f1b 290238384Sjkim ori $K,$K,0xbcdc 291238384Sjkim___ 292238384Sjkimfor(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } 293238384Sjkim$code.=<<___; # load K_60_79 294238384Sjkim lis $K,0xca62 295238384Sjkim ori $K,$K,0xc1d6 296238384Sjkim___ 297238384Sjkimfor(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } 298238384Sjkim$code.=<<___; 299238384Sjkim add r16,r16,$E 300238384Sjkim add r17,r17,$T 301238384Sjkim add r18,r18,$A 302238384Sjkim add r19,r19,$B 303238384Sjkim add r20,r20,$C 304238384Sjkim stw r16,0($ctx) 305238384Sjkim mr $A,r16 306238384Sjkim stw r17,4($ctx) 307238384Sjkim mr $B,r17 308238384Sjkim stw r18,8($ctx) 309238384Sjkim mr $C,r18 310238384Sjkim stw r19,12($ctx) 311238384Sjkim mr $D,r19 312238384Sjkim stw r20,16($ctx) 313238384Sjkim mr $E,r20 314238384Sjkim addi $inp,$inp,`16*4` 315238384Sjkim bdnz- Lsha1_block_private 316238384Sjkim blr 317238384Sjkim .long 0 318238384Sjkim .byte 0,12,0x14,0,0,0,0,0 319238384Sjkim___ 320238384Sjkim$code.=<<___; 321238384Sjkim.asciz "SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>" 322238384Sjkim___ 323238384Sjkim 324238384Sjkim$code =~ s/\`([^\`]*)\`/eval $1/gem; 325238384Sjkimprint $code; 326238384Sjkimclose STDOUT; 327