1238384Sjkim#!/usr/bin/env perl
2238384Sjkim
3238384Sjkim# ====================================================================
4238384Sjkim# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5238384Sjkim# project. The module is, however, dual licensed under OpenSSL and
6238384Sjkim# CRYPTOGAMS licenses depending on where you obtain it. For further
7238384Sjkim# details see http://www.openssl.org/~appro/cryptogams/.
8238384Sjkim# ====================================================================
9238384Sjkim
10238384Sjkim# SHA1 block procedure for s390x.
11238384Sjkim
12238384Sjkim# April 2007.
13238384Sjkim#
14238384Sjkim# Performance is >30% better than gcc 3.3 generated code. But the real
15238384Sjkim# twist is that SHA1 hardware support is detected and utilized. In
16238384Sjkim# which case performance can reach further >4.5x for larger chunks.
17238384Sjkim
18238384Sjkim# January 2009.
19238384Sjkim#
20238384Sjkim# Optimize Xupdate for amount of memory references and reschedule
21238384Sjkim# instructions to favour dual-issue z10 pipeline. On z10 hardware is
22238384Sjkim# "only" ~2.3x faster than software.
23238384Sjkim
24238384Sjkim# November 2010.
25238384Sjkim#
26238384Sjkim# Adapt for -m31 build. If kernel supports what's called "highgprs"
27238384Sjkim# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
28238384Sjkim# instructions and achieve "64-bit" performance even in 31-bit legacy
29238384Sjkim# application context. The feature is not specific to any particular
30238384Sjkim# processor, as long as it's "z-CPU". Latter implies that the code
31238384Sjkim# remains z/Architecture specific.
32238384Sjkim
33238384Sjkim$kimdfunc=1;	# magic function code for kimd instruction
34238384Sjkim
35238384Sjkim$flavour = shift;
36238384Sjkim
37238384Sjkimif ($flavour =~ /3[12]/) {
38238384Sjkim	$SIZE_T=4;
39238384Sjkim	$g="";
40238384Sjkim} else {
41238384Sjkim	$SIZE_T=8;
42238384Sjkim	$g="g";
43238384Sjkim}
44238384Sjkim
45238384Sjkimwhile (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
46238384Sjkimopen STDOUT,">$output";
47238384Sjkim
48238384Sjkim$K_00_39="%r0"; $K=$K_00_39;
49238384Sjkim$K_40_79="%r1";
50238384Sjkim$ctx="%r2";	$prefetch="%r2";
51238384Sjkim$inp="%r3";
52238384Sjkim$len="%r4";
53238384Sjkim
54238384Sjkim$A="%r5";
55238384Sjkim$B="%r6";
56238384Sjkim$C="%r7";
57238384Sjkim$D="%r8";
58238384Sjkim$E="%r9";	@V=($A,$B,$C,$D,$E);
59238384Sjkim$t0="%r10";
60238384Sjkim$t1="%r11";
61238384Sjkim@X=("%r12","%r13","%r14");
62238384Sjkim$sp="%r15";
63238384Sjkim
64238384Sjkim$stdframe=16*$SIZE_T+4*8;
65238384Sjkim$frame=$stdframe+16*4;
66238384Sjkim
67238384Sjkimsub Xupdate {
68238384Sjkimmy $i=shift;
69238384Sjkim
70238384Sjkim$code.=<<___ if ($i==15);
71238384Sjkim	lg	$prefetch,$stdframe($sp)	### Xupdate(16) warm-up
72238384Sjkim	lr	$X[0],$X[2]
73238384Sjkim___
74238384Sjkimreturn if ($i&1);	# Xupdate is vectorized and executed every 2nd cycle
75238384Sjkim$code.=<<___ if ($i<16);
76238384Sjkim	lg	$X[0],`$i*4`($inp)	### Xload($i)
77238384Sjkim	rllg	$X[1],$X[0],32
78238384Sjkim___
79238384Sjkim$code.=<<___ if ($i>=16);
80238384Sjkim	xgr	$X[0],$prefetch		### Xupdate($i)
81238384Sjkim	lg	$prefetch,`$stdframe+4*(($i+2)%16)`($sp)
82238384Sjkim	xg	$X[0],`$stdframe+4*(($i+8)%16)`($sp)
83238384Sjkim	xgr	$X[0],$prefetch
84238384Sjkim	rll	$X[0],$X[0],1
85238384Sjkim	rllg	$X[1],$X[0],32
86238384Sjkim	rll	$X[1],$X[1],1
87238384Sjkim	rllg	$X[0],$X[1],32
88238384Sjkim	lr	$X[2],$X[1]		# feedback
89238384Sjkim___
90238384Sjkim$code.=<<___ if ($i<=70);
91238384Sjkim	stg	$X[0],`$stdframe+4*($i%16)`($sp)
92238384Sjkim___
93238384Sjkimunshift(@X,pop(@X));
94238384Sjkim}
95238384Sjkim
96238384Sjkimsub BODY_00_19 {
97238384Sjkimmy ($i,$a,$b,$c,$d,$e)=@_;
98238384Sjkimmy $xi=$X[1];
99238384Sjkim
100238384Sjkim	&Xupdate($i);
101238384Sjkim$code.=<<___;
102238384Sjkim	alr	$e,$K		### $i
103238384Sjkim	rll	$t1,$a,5
104238384Sjkim	lr	$t0,$d
105238384Sjkim	xr	$t0,$c
106238384Sjkim	alr	$e,$t1
107238384Sjkim	nr	$t0,$b
108238384Sjkim	alr	$e,$xi
109238384Sjkim	xr	$t0,$d
110238384Sjkim	rll	$b,$b,30
111238384Sjkim	alr	$e,$t0
112238384Sjkim___
113238384Sjkim}
114238384Sjkim
115238384Sjkimsub BODY_20_39 {
116238384Sjkimmy ($i,$a,$b,$c,$d,$e)=@_;
117238384Sjkimmy $xi=$X[1];
118238384Sjkim
119238384Sjkim	&Xupdate($i);
120238384Sjkim$code.=<<___;
121238384Sjkim	alr	$e,$K		### $i
122238384Sjkim	rll	$t1,$a,5
123238384Sjkim	lr	$t0,$b
124238384Sjkim	alr	$e,$t1
125238384Sjkim	xr	$t0,$c
126238384Sjkim	alr	$e,$xi
127238384Sjkim	xr	$t0,$d
128238384Sjkim	rll	$b,$b,30
129238384Sjkim	alr	$e,$t0
130238384Sjkim___
131238384Sjkim}
132238384Sjkim
133238384Sjkimsub BODY_40_59 {
134238384Sjkimmy ($i,$a,$b,$c,$d,$e)=@_;
135238384Sjkimmy $xi=$X[1];
136238384Sjkim
137238384Sjkim	&Xupdate($i);
138238384Sjkim$code.=<<___;
139238384Sjkim	alr	$e,$K		### $i
140238384Sjkim	rll	$t1,$a,5
141238384Sjkim	lr	$t0,$b
142238384Sjkim	alr	$e,$t1
143238384Sjkim	or	$t0,$c
144238384Sjkim	lr	$t1,$b
145238384Sjkim	nr	$t0,$d
146238384Sjkim	nr	$t1,$c
147238384Sjkim	alr	$e,$xi
148238384Sjkim	or	$t0,$t1
149238384Sjkim	rll	$b,$b,30
150238384Sjkim	alr	$e,$t0
151238384Sjkim___
152238384Sjkim}
153238384Sjkim
154238384Sjkim$code.=<<___;
155238384Sjkim.text
156238384Sjkim.align	64
157238384Sjkim.type	Ktable,\@object
158238384SjkimKtable: .long	0x5a827999,0x6ed9eba1,0x8f1bbcdc,0xca62c1d6
159238384Sjkim	.skip	48	#.long	0,0,0,0,0,0,0,0,0,0,0,0
160238384Sjkim.size	Ktable,.-Ktable
161238384Sjkim.globl	sha1_block_data_order
162238384Sjkim.type	sha1_block_data_order,\@function
163238384Sjkimsha1_block_data_order:
164238384Sjkim___
165238384Sjkim$code.=<<___ if ($kimdfunc);
166238384Sjkim	larl	%r1,OPENSSL_s390xcap_P
167238384Sjkim	lg	%r0,0(%r1)
168238384Sjkim	tmhl	%r0,0x4000	# check for message-security assist
169238384Sjkim	jz	.Lsoftware
170238384Sjkim	lghi	%r0,0
171238384Sjkim	la	%r1,`2*$SIZE_T`($sp)
172238384Sjkim	.long	0xb93e0002	# kimd %r0,%r2
173238384Sjkim	lg	%r0,`2*$SIZE_T`($sp)
174238384Sjkim	tmhh	%r0,`0x8000>>$kimdfunc`
175238384Sjkim	jz	.Lsoftware
176238384Sjkim	lghi	%r0,$kimdfunc
177238384Sjkim	lgr	%r1,$ctx
178238384Sjkim	lgr	%r2,$inp
179238384Sjkim	sllg	%r3,$len,6
180238384Sjkim	.long	0xb93e0002	# kimd %r0,%r2
181238384Sjkim	brc	1,.-4		# pay attention to "partial completion"
182238384Sjkim	br	%r14
183238384Sjkim.align	16
184238384Sjkim.Lsoftware:
185238384Sjkim___
186238384Sjkim$code.=<<___;
187238384Sjkim	lghi	%r1,-$frame
188238384Sjkim	st${g}	$ctx,`2*$SIZE_T`($sp)
189238384Sjkim	stm${g}	%r6,%r15,`6*$SIZE_T`($sp)
190238384Sjkim	lgr	%r0,$sp
191238384Sjkim	la	$sp,0(%r1,$sp)
192238384Sjkim	st${g}	%r0,0($sp)
193238384Sjkim
194238384Sjkim	larl	$t0,Ktable
195238384Sjkim	llgf	$A,0($ctx)
196238384Sjkim	llgf	$B,4($ctx)
197238384Sjkim	llgf	$C,8($ctx)
198238384Sjkim	llgf	$D,12($ctx)
199238384Sjkim	llgf	$E,16($ctx)
200238384Sjkim
201238384Sjkim	lg	$K_00_39,0($t0)
202238384Sjkim	lg	$K_40_79,8($t0)
203238384Sjkim
204238384Sjkim.Lloop:
205238384Sjkim	rllg	$K_00_39,$K_00_39,32
206238384Sjkim___
207238384Sjkimfor ($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
208238384Sjkim$code.=<<___;
209238384Sjkim	rllg	$K_00_39,$K_00_39,32
210238384Sjkim___
211238384Sjkimfor (;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
212238384Sjkim$code.=<<___;	$K=$K_40_79;
213238384Sjkim	rllg	$K_40_79,$K_40_79,32
214238384Sjkim___
215238384Sjkimfor (;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
216238384Sjkim$code.=<<___;
217238384Sjkim	rllg	$K_40_79,$K_40_79,32
218238384Sjkim___
219238384Sjkimfor (;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
220238384Sjkim$code.=<<___;
221238384Sjkim
222238384Sjkim	l${g}	$ctx,`$frame+2*$SIZE_T`($sp)
223238384Sjkim	la	$inp,64($inp)
224238384Sjkim	al	$A,0($ctx)
225238384Sjkim	al	$B,4($ctx)
226238384Sjkim	al	$C,8($ctx)
227238384Sjkim	al	$D,12($ctx)
228238384Sjkim	al	$E,16($ctx)
229238384Sjkim	st	$A,0($ctx)
230238384Sjkim	st	$B,4($ctx)
231238384Sjkim	st	$C,8($ctx)
232238384Sjkim	st	$D,12($ctx)
233238384Sjkim	st	$E,16($ctx)
234238384Sjkim	brct${g} $len,.Lloop
235238384Sjkim
236238384Sjkim	lm${g}	%r6,%r15,`$frame+6*$SIZE_T`($sp)
237238384Sjkim	br	%r14
238238384Sjkim.size	sha1_block_data_order,.-sha1_block_data_order
239238384Sjkim.string	"SHA1 block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
240238384Sjkim.comm	OPENSSL_s390xcap_P,16,8
241238384Sjkim___
242238384Sjkim
243238384Sjkim$code =~ s/\`([^\`]*)\`/eval $1/gem;
244238384Sjkim
245238384Sjkimprint $code;
246238384Sjkimclose STDOUT;
247