1238384Sjkim#!/usr/bin/env perl
2238384Sjkim
3238384Sjkim# ====================================================================
4238384Sjkim# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5238384Sjkim# project. The module is, however, dual licensed under OpenSSL and
6238384Sjkim# CRYPTOGAMS licenses depending on where you obtain it. For further
7238384Sjkim# details see http://www.openssl.org/~appro/cryptogams/.
8238384Sjkim# ====================================================================
9238384Sjkim
# Performance improvement is not really impressive on pre-T1 CPUs: +8%
# over Sun C and +25% over gcc [3.3]. On T1, a.k.a. Niagara, however,
# it turned out to be 40% faster than 64-bit code generated by Sun C 5.8
# and >2x faster than 64-bit code generated by gcc 3.4. And there is a
# gimmick: the X[16] vector is packed into 8 64-bit registers and as a
# result nothing is spilled on the stack. In addition, input data is
# loaded in a compact instruction sequence, thus minimizing the window
# when the code is subject to [inter-thread] cache-thrashing hazard. The
# goal is to ensure scalability on UltraSPARC T1, or rather to avoid
# decay when the number of active threads exceeds the number of physical
# cores.
20238384Sjkim
# Select the target ABI from the command line: any argument matching
# -m64 or -xarch=v9 selects the 64-bit variant, otherwise 32-bit code is
# generated. $frame is the size handed to the "save" instruction in the
# emitted prologue; $bias is set alongside it but is not referenced in
# the visible part of this script.
$bits=32;
for (@ARGV)	{ $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64)	{ $bias=2047; $frame=192; }
else		{ $bias=0;    $frame=112; }

# The first command-line argument names the output file. Use the
# three-argument form of open so the name is never parsed for mode
# characters, and abort on failure so the assembly is not silently sent
# to the inherited stdout. With no argument at all the script keeps
# writing to the inherited stdout, as before.
$output=shift;
if (defined $output) {
	open STDOUT,">",$output or die "can't open $output: $!";
}
28238384Sjkim
# Register allocation for the generated SPARC code.

# Eight 64-bit registers that together hold the 16-word X[] vector,
# two 32-bit words per register.
@X = qw(%o0 %o1 %o2 %o3 %o4 %o5 %g1 %o7);

# $rot1m receives the (1<<32)|1 mask built in the prologue, $tmp64 the
# extra doubleword fetched for unaligned input, and $Xi carries a single
# 32-bit word extracted from (or on its way into) an @X register.
($rot1m, $tmp64, $Xi) = qw(%g2 %g3 %g4);

# The five working variables; @V is rotated after every round.
($A, $B, $C, $D, $E) = qw(%l0 %l1 %l2 %l3 %l4);
@V = ($A, $B, $C, $D, $E);

# Round constants, one per group of 20 rounds (indexed by round/20).
($K_00_19, $K_20_39, $K_40_59, $K_60_79) = qw(%l5 %l6 %l7 %g5);
@K = ($K_00_19, $K_20_39, $K_40_59, $K_60_79);

# Incoming arguments: state pointer, input pointer and a count that the
# prologue scales by 64 and turns into an end-of-input pointer.
($ctx, $inp, $len) = qw(%i0 %i1 %i2);

# Scratch registers used inside every round.
($tmp0, $tmp1, $tmp2) = qw(%i3 %i4 %i5);
51238384Sjkim
# Append the code for one of rounds 0..15 to $code.
#
#   $round        round number
#   $a,$b,$c,$d,$e  registers currently holding the five working values
#
# The emitted sequence adds the round constant, both halves of the
# rotated $a, the (b&c)|(~b&d) selection and the message word to $e,
# and rotates $b in place (sll 30 / srl 2 / or).
sub BODY_00_15 {
my ($round,$a,$b,$c,$d,$e)=@_;
my $odd=$round&1;
my $k=$K[$round/20];
# Odd rounds add the packed @X register directly; even rounds use the
# word previously extracted into $Xi.
my $w=$odd ? $X[($round/2)%8] : $Xi;

$code.=<<___;
	sll	$a,5,$tmp0		!! $round
	add	$k,$e,$e
	srl	$a,27,$tmp1
	add	$tmp0,$e,$e
	and	$c,$b,$tmp0
	add	$tmp1,$e,$e
	sll	$b,30,$tmp2
	andn	$d,$b,$tmp1
	srl	$b,2,$b
	or	$tmp1,$tmp0,$tmp1
	or	$tmp2,$b,$b
	add	$w,$e,$e
___
# After every odd round but the last, extract the upper half of the
# next @X register into $Xi for the following even round.
$code.="\tsrlx\t$X[(($round+1)/2)%8],32,$Xi\n" if ($odd && $round<15);
$code.="\tadd\t$tmp1,$e,$e\n";
}
78238384Sjkim
# Append the opening instructions of a round >= 16 to $code.
#
#   $_[0]  round number
#   $_[1]  register holding the "a" working value
#   $_[5]  register holding the "e" working value
#
# Every round gets the "sll 5 / add K / srl 27" start. On even rounds
# these are interleaved with the update of one packed @X register (two
# 32-bit words at once): it is xor-ed with the next register, the one
# four further on, and a doubleword assembled from the low half of
# register +6 and the high half of register +7; then both 32-bit halves
# are rotated left by one bit with the help of the $rot1m mask.
sub Xupdate {
my ($round,$a,$e)=@_[0,1,5];
my $k=$K[$round/20];

if (($round&1)==0) {
	my $pair=$round/2;
	my ($x0,$x1,$x4,$x6,$x7)=map { $X[($pair+$_)%8] } (0,1,4,6,7);
	$code.=<<___;
	sllx	$x6,32,$Xi	! Xupdate($round)
	xor	$x1,$x0,$x0
	srlx	$x7,32,$tmp1
	xor	$x4,$x0,$x0
	sll	$a,5,$tmp0		!! $round
	or	$tmp1,$Xi,$Xi
	add	$k,$e,$e		!!
	xor	$Xi,$x0,$x0
	srlx	$x0,31,$Xi
	add	$x0,$x0,$x0
	and	$Xi,$rot1m,$Xi
	andn	$x0,$rot1m,$x0
	srl	$a,27,$tmp1		!!
	or	$Xi,$x0,$x0
___
} else {
	$code.=<<___;
	sll	$a,5,$tmp0		!! $round
	add	$k,$e,$e
	srl	$a,27,$tmp1
___
}
}
108238384Sjkim
# Append the code for one of rounds 16..19 to $code.
#
#   $i            round number
#   $a,$b,$c,$d,$e  registers currently holding the five working values
#
# Xupdate() emits the start of the round (and, on even rounds, the
# update of the packed @X register). The remainder adds the rotated $a,
# the (b&c)|(~b&d) selection and the message word to $e, and rotates $b.
sub BODY_16_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
# Lexical, like in BODY_20_39/BODY_40_59, so that no package global
# $xi is left behind.
my $xi;

	&Xupdate(@_);
    if ($i&1) {
	# Odd round: the packed register is added as is.
	$xi=$X[($i/2)%8];
    } else {
	# Even round: extract the upper half of the register into $Xi.
	$xi=$Xi;
	$code.="\tsrlx	$X[($i/2)%8],32,$xi\n";
    }
$code.=<<___;
	add	$tmp0,$e,$e		!!
	and	$c,$b,$tmp0
	add	$tmp1,$e,$e
	sll	$b,30,$tmp2
	add	$xi,$e,$e
	andn	$d,$b,$tmp1
	srl	$b,2,$b
	or	$tmp1,$tmp0,$tmp1
	or	$tmp2,$b,$b
	add	$tmp1,$e,$e
___
}
132238384Sjkim
# Append the code for one round of the b^c^d kind to $code; the driver
# uses it for rounds 20..39 and again for rounds 60..79.
#
#   $round        round number
#   $a,$b,$c,$d,$e  registers currently holding the five working values
sub BODY_20_39 {
my ($round,$a,$b,$c,$d,$e)=@_;
my $packed=$X[($round/2)%8];
# Odd rounds add the packed register directly.
my $w=$packed;

Xupdate(@_);
if (($round&1)==0) {
	# Even rounds first extract the upper half into $Xi.
	$w=$Xi;
	$code.="\tsrlx\t$packed,32,$w\n";
}
$code.=<<___;
	add	$tmp0,$e,$e		!!
	xor	$c,$b,$tmp0
	add	$tmp1,$e,$e
	sll	$b,30,$tmp2
	xor	$d,$tmp0,$tmp1
	srl	$b,2,$b
	add	$tmp1,$e,$e
	or	$tmp2,$b,$b
	add	$w,$e,$e
___
}
155238384Sjkim
# Append the code for one of rounds 40..59 to $code. The round function
# is emitted as (b&c)|((b|c)&d).
#
#   $round        round number
#   $a,$b,$c,$d,$e  registers currently holding the five working values
sub BODY_40_59 {
my ($round,$a,$b,$c,$d,$e)=@_;
my $packed=$X[($round/2)%8];
# Odd rounds add the packed register directly.
my $w=$packed;

Xupdate(@_);
if (($round&1)==0) {
	# Even rounds first extract the upper half into $Xi.
	$w=$Xi;
	$code.="\tsrlx\t$packed,32,$w\n";
}
$code.=<<___;
	add	$tmp0,$e,$e		!!
	and	$c,$b,$tmp0
	add	$tmp1,$e,$e
	sll	$b,30,$tmp2
	or	$c,$b,$tmp1
	srl	$b,2,$b
	and	$d,$tmp1,$tmp1
	add	$w,$e,$e
	or	$tmp1,$tmp0,$tmp1
	or	$tmp2,$b,$b
	add	$tmp1,$e,$e
___
}
180238384Sjkim
# The 64-bit variant declares %g2 and %g3 as scratch registers.
if ($bits==64) {
	$code.=<<___;
.register	%g2,#scratch
.register	%g3,#scratch
___
}

# Function prologue and the head of the per-block loop:
#  - turn $len into an end pointer ($inp + $len*64),
#  - build the (1<<32)|1 mask in $rot1m,
#  - load the five state words and the four round constants,
#  - load eight doublewords from the 8-byte-aligned address below $inp,
#  - branch to .Laligned when $inp is 8-byte aligned, otherwise start
#    the shift-and-merge fix-up and fetch a ninth doubleword.
$code.=<<___;
.section	".text",#alloc,#execinstr

.align	32
.globl	sha1_block_data_order
sha1_block_data_order:
	save	%sp,-$frame,%sp
	sllx	$len,6,$len
	add	$inp,$len,$len

	or	%g0,1,$rot1m
	sllx	$rot1m,32,$rot1m
	or	$rot1m,1,$rot1m

	ld	[$ctx+0],$A
	ld	[$ctx+4],$B
	ld	[$ctx+8],$C
	ld	[$ctx+12],$D
	ld	[$ctx+16],$E
	andn	$inp,7,$tmp0

	sethi	%hi(0x5a827999),$K_00_19
	or	$K_00_19,%lo(0x5a827999),$K_00_19
	sethi	%hi(0x6ed9eba1),$K_20_39
	or	$K_20_39,%lo(0x6ed9eba1),$K_20_39
	sethi	%hi(0x8f1bbcdc),$K_40_59
	or	$K_40_59,%lo(0x8f1bbcdc),$K_40_59
	sethi	%hi(0xca62c1d6),$K_60_79
	or	$K_60_79,%lo(0xca62c1d6),$K_60_79

.Lloop:
	ldx	[$tmp0+0],$X[0]
	ldx	[$tmp0+16],$X[2]
	ldx	[$tmp0+32],$X[4]
	ldx	[$tmp0+48],$X[6]
	and	$inp,7,$tmp1
	ldx	[$tmp0+8],$X[1]
	sll	$tmp1,3,$tmp1
	ldx	[$tmp0+24],$X[3]
	subcc	%g0,$tmp1,$tmp2	! should be 64-$tmp1, but -$tmp1 works too
	ldx	[$tmp0+40],$X[5]
	bz,pt	%icc,.Laligned
	ldx	[$tmp0+56],$X[7]

	sllx	$X[0],$tmp1,$X[0]
	ldx	[$tmp0+64],$tmp64
___
# Unaligned input: for each adjacent pair of @X registers, move the
# leading bits of the later one (shifted right by $tmp2) into the
# earlier one and shift the later one left by $tmp1.
for my $n (0..6) {
	my ($head,$next)=@X[$n,$n+1];
	$code.=<<___;
	srlx	$next,$tmp2,$Xi
	sllx	$next,$tmp1,$next
	or	$Xi,$head,$head
___
}

# The last register is completed from the extra doubleword in $tmp64.
# Both paths meet at .Laligned, where the upper half of the first
# register is extracted into $Xi for round 0.
$code.=<<___;
	srlx	$tmp64,$tmp2,$tmp64
	or	$tmp64,$X[7],$X[7]
.Laligned:
	srlx	$X[0],32,$Xi
___
# Emit all 80 rounds. The generator is chosen by round number (rounds
# 60..79 reuse BODY_20_39), and after each round @V is rotated right by
# one so that every working value moves on to the next role.
for ($i=0;$i<80;$i++) {
	my $emit = $i<16 ? \&BODY_00_15
		 : $i<20 ? \&BODY_16_19
		 : $i<40 ? \&BODY_20_39
		 : $i<60 ? \&BODY_40_59
		 :         \&BODY_20_39;
	$emit->($i,@V);
	unshift(@V,pop(@V));
}
# Loop tail and function epilogue: reload the previous state into the
# first five @X registers, add the working values to it and store the
# result back, advance $inp by 64 and loop until it reaches the end
# pointer kept in $len. The branch delay slot recomputes the aligned
# load address for the next block. The back-ticked expression is
# resolved later by the eval substitution applied to $code.
$code.=<<___;

	ld	[$ctx+0],$X[0]
	ld	[$ctx+4],$X[1]
	ld	[$ctx+8],$X[2]
	ld	[$ctx+12],$X[3]
	add	$inp,64,$inp
	ld	[$ctx+16],$X[4]
	cmp	$inp,$len

	add	$A,$X[0],$A
	st	$A,[$ctx+0]
	add	$B,$X[1],$B
	st	$B,[$ctx+4]
	add	$C,$X[2],$C
	st	$C,[$ctx+8]
	add	$D,$X[3],$D
	st	$D,[$ctx+12]
	add	$E,$X[4],$E
	st	$E,[$ctx+16]

	bne	`$bits==64?"%xcc":"%icc"`,.Lloop
	andn	$inp,7,$tmp0

	ret
	restore
.type	sha1_block_data_order,#function
.size	sha1_block_data_order,(.-sha1_block_data_order)
.asciz	"SHA1 block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___
281238384Sjkim
# Replace every `...` span in the generated text with the result of
# evaluating it as Perl, then write the assembly out.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# Buffered write errors (e.g. a full disk) only surface when the handle
# is closed; fail loudly instead of leaving a truncated output file.
close STDOUT or die "error closing STDOUT: $!";
285