#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# Performance improvement is not really impressive on pre-T1 CPU: +8%
# over Sun C and +25% over gcc [3.3]. While on T1, a.k.a. Niagara, it
# turned to be 40% faster than 64-bit code generated by Sun C 5.8 and
# >2x than 64-bit code generated by gcc 3.4. And there is a gimmick.
# X[16] vector is packed to 8 64-bit registers and as result nothing
# is spilled on stack. In addition input data is loaded in compact
# instruction sequence, thus minimizing the window when the code is
# subject to [inter-thread] cache-thrashing hazard. The goal is to
# ensure scalability on UltraSPARC T1, or rather to avoid decay when
# amount of active threads exceeds the number of physical cores.

# Usage: perl <this script> <output file> [compiler flags...]
#
# The first argument names the file the generated assembly is written
# to; when it is absent the assembly goes to the inherited STDOUT.
# Every argument (the output name included) is scanned for -m64 or
# -xarch=v9, either of which selects the 64-bit flavour.

use strict;
use warnings;

my $bits=32;
for (@ARGV)	{ $bits=64 if (/\-m64/ || /\-xarch\=v9/); }

# $frame is the stack frame size handed to "save". $bias is carried
# over from the original script; nothing below reads it.
my ($bias,$frame);
if ($bits==64)	{ $bias=2047; $frame=192; }
else		{ $bias=0;    $frame=112; }

my $output=shift;
if (defined $output) {
    open STDOUT,'>',$output or die "can't open $output: $!";
}

# Accumulates the whole generated assembly text.
my $code="";

# Message schedule: 16 32-bit words packed pairwise into eight 64-bit
# registers, even-indexed word in the upper half, odd-indexed word in
# the lower half.
my @X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
my $rot1m="%g2";	# 0x0000000100000001, see Xupdate
my $tmp64="%g3";
my $Xi="%g4";		# upper word of an X register, moved down by 32 bits
my $A="%l0";
my $B="%l1";
my $C="%l2";
my $D="%l3";
my $E="%l4";
my @V=($A,$B,$C,$D,$E);
my $K_00_19="%l5";
my $K_20_39="%l6";
my $K_40_59="%l7";
my $K_60_79="%g5";
my @K=($K_00_19,$K_20_39,$K_40_59,$K_60_79);

my $ctx="%i0";
my $inp="%i1";
my $len="%i2";
my $tmp0="%i3";
my $tmp1="%i4";
my $tmp2="%i5";

# Condition-code set tested by the loop-closing pointer comparison.
my $ptr_cc=($bits==64) ? "%xcc" : "%icc";

# BODY_00_15($i,$a,$b,$c,$d,$e)
#
# Emits round $i (0..15). $a..$e are the registers currently playing
# the roles of the five working variables. The round adds
# rol($a,5), K, ($b&$c)|(~$b&$d) and the message word to $e and rotates
# $b left by 30. Even rounds take the word from $Xi, odd rounds use the
# X register itself (only its low 32 bits matter for the sum) and, for
# all but round 15, pre-extract the next even word into $Xi.
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $k=$K[int($i/20)];
my $xi=($i&1) ? $X[int($i/2)%8] : $Xi;

$code.=<<___;
	sll	$a,5,$tmp0		!! $i
	add	$k,$e,$e
	srl	$a,27,$tmp1
	add	$tmp0,$e,$e
	and	$c,$b,$tmp0
	add	$tmp1,$e,$e
	sll	$b,30,$tmp2
	andn	$d,$b,$tmp1
	srl	$b,2,$b
	or	$tmp1,$tmp0,$tmp1
	or	$tmp2,$b,$b
	add	$xi,$e,$e
___
if (($i&1) && $i<15) {
    my $next=$X[int(($i+1)/2)%8];
    $code.="\tsrlx\t$next,32,$Xi\n";
}
$code.=<<___;
	add	$tmp1,$e,$e
___
return;
}

# Xupdate($i,$a,$b,$c,$d,$e)
#
# Emits the start of round $i (16..79): rol($a,5) split into $tmp0 and
# $tmp1 (the caller adds both to $e) and the addition of K to $e.
#
# On even rounds it additionally recomputes, interleaved with the
# above, the X register holding words $i and $i+1: it is xored with the
# registers 1 and 4 positions ahead and with the 64-bit value formed
# from the lower word of the register 6 ahead and the upper word of the
# register 7 ahead, then both 32-bit halves are rotated left by one bit
# using the $rot1m mask. Odd rounds reuse that result, so they emit no
# schedule instructions.
sub Xupdate {
my ($i,$a,$b,$c,$d,$e)=@_;
my $k=$K[int($i/20)];

if ($i&1) {
$code.=<<___;
	sll	$a,5,$tmp0		!! $i
	add	$k,$e,$e
	srl	$a,27,$tmp1
___
} else {
my $j=int($i/2);
my $x0=$X[$j%8];
my $x1=$X[($j+1)%8];
my $x4=$X[($j+4)%8];
my $x6=$X[($j+6)%8];
my $x7=$X[($j+7)%8];
$code.=<<___;
	sllx	$x6,32,$Xi	! Xupdate($i)
	xor	$x1,$x0,$x0
	srlx	$x7,32,$tmp1
	xor	$x4,$x0,$x0
	sll	$a,5,$tmp0		!! $i
	or	$tmp1,$Xi,$Xi
	add	$k,$e,$e		!!
	xor	$Xi,$x0,$x0
	srlx	$x0,31,$Xi
	add	$x0,$x0,$x0
	and	$Xi,$rot1m,$Xi
	andn	$x0,$rot1m,$x0
	srl	$a,27,$tmp1		!!
	or	$Xi,$x0,$x0
___
}
return;
}

# _round_prologue($i,$a,$b,$c,$d,$e)
#
# Shared opening of rounds 16..79: runs Xupdate and returns the
# register that holds the message word for round $i. Odd rounds use the
# X register directly; even rounds first emit a shift that moves the
# upper word down into $Xi.
sub _round_prologue {
my ($i)=@_;
my $x=$X[int($i/2)%8];

Xupdate(@_);
return $x if ($i&1);

$code.="\tsrlx\t$x,32,$Xi\n";
return $Xi;
}

# BODY_16_19($i,$a,$b,$c,$d,$e)
#
# Emits round $i (16..19): same round function as BODY_00_15,
# ($b&$c)|(~$b&$d), with the message word produced by Xupdate.
sub BODY_16_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi=_round_prologue(@_);

$code.=<<___;
	add	$tmp0,$e,$e		!!
	and	$c,$b,$tmp0
	add	$tmp1,$e,$e
	sll	$b,30,$tmp2
	add	$xi,$e,$e
	andn	$d,$b,$tmp1
	srl	$b,2,$b
	or	$tmp1,$tmp0,$tmp1
	or	$tmp2,$b,$b
	add	$tmp1,$e,$e
___
return;
}

# BODY_20_39($i,$a,$b,$c,$d,$e)
#
# Emits a round whose function is $b^$c^$d. The driver uses it for
# rounds 20..39 and again for rounds 60..79; the constant register is
# picked from $i inside Xupdate.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi=_round_prologue(@_);

$code.=<<___;
	add	$tmp0,$e,$e		!!
	xor	$c,$b,$tmp0
	add	$tmp1,$e,$e
	sll	$b,30,$tmp2
	xor	$d,$tmp0,$tmp1
	srl	$b,2,$b
	add	$tmp1,$e,$e
	or	$tmp2,$b,$b
	add	$xi,$e,$e
___
return;
}

# BODY_40_59($i,$a,$b,$c,$d,$e)
#
# Emits round $i (40..59) with round function ($b&$c)|(($b|$c)&$d).
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi=_round_prologue(@_);

$code.=<<___;
	add	$tmp0,$e,$e		!!
	and	$c,$b,$tmp0
	add	$tmp1,$e,$e
	sll	$b,30,$tmp2
	or	$c,$b,$tmp1
	srl	$b,2,$b
	and	$d,$tmp1,$tmp1
	add	$xi,$e,$e
	or	$tmp1,$tmp0,$tmp1
	or	$tmp2,$b,$b
	add	$tmp1,$e,$e
___
return;
}

if ($bits==64) {
$code.=<<___;
.register	%g2,#scratch
.register	%g3,#scratch
___
}

# Function prologue: turn the block count in $len into an end pointer,
# build the rotate mask, load the five state words and the four round
# constants. At .Lloop eight 64-bit loads fetch the block from $inp
# rounded down to 8 bytes ($tmp0); the last load sits in the delay slot
# of the branch that skips realignment when $inp is 8-byte aligned.
$code.=<<___;
.section	".text",#alloc,#execinstr

.align	32
.globl	sha1_block_data_order
sha1_block_data_order:
	save	%sp,-$frame,%sp
	sllx	$len,6,$len
	add	$inp,$len,$len

	or	%g0,1,$rot1m
	sllx	$rot1m,32,$rot1m
	or	$rot1m,1,$rot1m

	ld	[$ctx+0],$A
	ld	[$ctx+4],$B
	ld	[$ctx+8],$C
	ld	[$ctx+12],$D
	ld	[$ctx+16],$E
	andn	$inp,7,$tmp0

	sethi	%hi(0x5a827999),$K_00_19
	or	$K_00_19,%lo(0x5a827999),$K_00_19
	sethi	%hi(0x6ed9eba1),$K_20_39
	or	$K_20_39,%lo(0x6ed9eba1),$K_20_39
	sethi	%hi(0x8f1bbcdc),$K_40_59
	or	$K_40_59,%lo(0x8f1bbcdc),$K_40_59
	sethi	%hi(0xca62c1d6),$K_60_79
	or	$K_60_79,%lo(0xca62c1d6),$K_60_79

.Lloop:
	ldx	[$tmp0+0],$X[0]
	ldx	[$tmp0+16],$X[2]
	ldx	[$tmp0+32],$X[4]
	ldx	[$tmp0+48],$X[6]
	and	$inp,7,$tmp1
	ldx	[$tmp0+8],$X[1]
	sll	$tmp1,3,$tmp1
	ldx	[$tmp0+24],$X[3]
	subcc	%g0,$tmp1,$tmp2	! should be 64-$tmp1, but -$tmp1 works too
	ldx	[$tmp0+40],$X[5]
	bz,pt	%icc,.Laligned
	ldx	[$tmp0+56],$X[7]

	sllx	$X[0],$tmp1,$X[0]
	ldx	[$tmp0+64],$tmp64
___

# Misaligned input: a ninth doubleword is loaded into $tmp64 and each X
# register is shifted left by the misalignment (in bits) and merged
# with the top bits of its successor.
for my $n (0..6) {
my ($cur,$nxt)=@X[$n,$n+1];
$code.=<<___;
	srlx	$nxt,$tmp2,$Xi
	sllx	$nxt,$tmp1,$nxt
	or	$Xi,$cur,$cur
___
}
$code.=<<___;
	srlx	$tmp64,$tmp2,$tmp64
	or	$tmp64,$X[7],$X[7]
.Laligned:
	srlx	$X[0],32,$Xi
___

# The 80 rounds, fully unrolled. Rotating @V after every round renames
# the working variables instead of emitting register moves; after 80
# rotations @V is back in its initial order.
for my $i (0..79) {
    if    ($i<16)	{ BODY_00_15($i,@V); }
    elsif ($i<20)	{ BODY_16_19($i,@V); }
    elsif ($i<40)	{ BODY_20_39($i,@V); }
    elsif ($i<60)	{ BODY_40_59($i,@V); }
    else		{ BODY_20_39($i,@V); }
    unshift(@V,pop(@V));
}

# Add the working variables into the context (the X registers are free
# to serve as temporaries here), advance $inp and loop until it reaches
# the end pointer. The andn in the branch delay slot recomputes the
# aligned load address for the next iteration.
$code.=<<___;

	ld	[$ctx+0],$X[0]
	ld	[$ctx+4],$X[1]
	ld	[$ctx+8],$X[2]
	ld	[$ctx+12],$X[3]
	add	$inp,64,$inp
	ld	[$ctx+16],$X[4]
	cmp	$inp,$len

	add	$A,$X[0],$A
	st	$A,[$ctx+0]
	add	$B,$X[1],$B
	st	$B,[$ctx+4]
	add	$C,$X[2],$C
	st	$C,[$ctx+8]
	add	$D,$X[3],$D
	st	$D,[$ctx+12]
	add	$E,$X[4],$E
	st	$E,[$ctx+16]

	bne	$ptr_cc,.Lloop
	andn	$inp,7,$tmp0

	ret
	restore
.type	sha1_block_data_order,#function
.size	sha1_block_data_order,(.-sha1_block_data_order)
.asciz	"SHA1 block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___

print STDOUT $code;
# Buffered write errors only surface at close time.
close STDOUT or die "error closing STDOUT: $!";