#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# April 2007.
#
# Performance improvement over vanilla C code varies from 85% to 45%
# depending on key length and benchmark. Unfortunately in this context
# these are not very impressive results [for code that utilizes "wide"
# 64x64=128-bit multiplication, which is not commonly available to C
# programmers]; at least a hand-coded bn_asm.c replacement is known to
# provide 30-40% better results for the longest keys. Well, on second
# thought it's not very surprising, because z-CPUs are single-issue
# and _strictly_ in-order execution, while bn_mul_mont is more or less
# dependent on the CPU's ability to pipeline instructions and have
# several of them "in-flight" at the same time. I mean while other
# methods, for example Karatsuba, aim to minimize the number of
# multiplications at the cost of an increase in other operations,
# bn_mul_mont aims to neatly "overlap" multiplications and the other
# operations [and on most platforms even minimize the amount of the
# other operations, in particular references to memory]. But it's
# possible to improve this module's performance by implementing a
# dedicated squaring code path and possibly by unrolling loops...

# January 2009.
#
# Reschedule to minimize/avoid Address Generation Interlock hazard,
# make inner loops counter-based.

# November 2010.
#
# Adapt for -m31 build. If the kernel supports what's called the
# "highgprs" feature on Linux [see /proc/cpuinfo], it's possible to use
# 64-bit instructions and achieve "64-bit" performance even in a 31-bit
# legacy application context. The feature is not specific to any
# particular processor, as long as it's a "z-CPU". The latter implies
# that the code remains z/Architecture specific. Compatibility with
# 32-bit BN_ULONG is achieved by swapping words after 64-bit loads,
# follow _dswap-s. On z990 it was measured to perform 2.6-2.2 times
# better than compiler-generated code, less for longer keys...

# Target selector, taken from the first command-line argument.  A flavour
# containing "31" or "32" selects the 31-bit build; anything else (including
# a missing argument) selects the 64-bit build.
$flavour = shift;

if ($flavour =~ /3[12]/) {
	$SIZE_T=4;	# width in bytes of one saved-register slot
	$g="";		# mnemonic suffix: emits st, stm, l, lm, cl
} else {
	$SIZE_T=8;
	$g="g";		# mnemonic suffix: emits stg, stmg, lg, lmg, clg
}

# Consume the remaining arguments until one looks like a bare "name.ext"
# file name; anything before it is skipped.  $output is left false when no
# such argument is present.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect the generated assembly to that file.  Without an output name
# STDOUT is left untouched: re-opening it with an empty name would fail and
# leave the handle closed, silently discarding everything printed later.
if ($output) {
	open(STDOUT, '>', $output) or die "can't open $output: $!";
}

# Fixed part of the stack frame: 16 register-sized slots plus four 8-byte
# slots, i.e. 96 bytes for the 31-bit build and 160 bytes for the 64-bit one.
# The generated code finds its stack-passed argument just past this area and
# places its temporary vector tp[] at this offset in its own frame.
$stdframe=16*$SIZE_T+4*8;

# Scratch registers that are not incoming arguments.
$mn0="%r0";		# tp[0]*n0, the per-iteration Montgomery multiplier
$num="%r1";		# loaded from the stack at function entry

# int bn_mul_mont(
$rp="%r2";		# BN_ULONG *rp,
$ap="%r3";		# const BN_ULONG *ap,
$bp="%r4";		# const BN_ULONG *bp,
$np="%r5";		# const BN_ULONG *np,
$n0="%r6";		# const BN_ULONG *n0,
#$num="160(%r15)"	# int num);

# %r2 is reused for the current bp[i] word; the generated code saves rp to
# the stack first and reloads it before the final subtraction.
$bi="%r2";	# zaps rp
$j="%r7";	# byte offset used as the loop index

# High/low halves of the 128-bit products (even/odd register pairs for mlgr).
$ahi="%r8";
$alo="%r9";
$nhi="%r10";
$nlo="%r11";
# Carries propagated between consecutive words of the two product chains.
$AHI="%r12";
$NHI="%r13";
$count="%r14";	# brct loop counter; also holds the return address on entry
$sp="%r15";

# Entry sequence of bn_mul_mont.  The heredocs interpolate the Perl register
# aliases immediately; text between back-ticks is left in place here and
# evaluated as Perl arithmetic by the post-processing loop at the end of the
# script.
#
# The emitted code loads num from the stack, scales it to a byte count,
# advances bp to &bp[num], saves rp in the caller's frame and returns 0
# when the byte count is below 16.
$code.=<<___;
.text
.globl	bn_mul_mont
.type	bn_mul_mont,\@function
bn_mul_mont:
	lgf	$num,`$stdframe+$SIZE_T-4`($sp)	# pull $num
	sla	$num,`log($SIZE_T)/log(2)`	# $num to enumerate bytes
	la	$bp,0($num,$bp)

	st${g}	%r2,2*$SIZE_T($sp)

	cghi	$num,16		#
	lghi	%r2,0		#
	blr	%r14		# if($num<16) return 0;
___
# 31-bit build only: the byte count must be a multiple of 8 (an even number
# of 32-bit words), because the body works on 64-bit quantities.
$code.=<<___ if ($flavour =~ /3[12]/);
	tmll	$num,4
	bnzr	%r14		# if ($num&1) return 0;
___
# 64-bit build only: decline (return 0) when the byte count exceeds 96.
$code.=<<___ if ($flavour !~ /3[12]/);
	cghi	$num,96		#
	bhr	%r14		# if($num>96) return 0;
___
# Main body of bn_mul_mont.  In order, the emitted code:
#   - saves %r3-%r15 in the caller's frame and carves a new frame of
#     $stdframe+8+num bytes below it (tp[] plus room for the top carry),
#     storing the back chain;
#   - converts num from bytes to a count of 64-bit words, minus one for the
#     inner loops, and rewinds bp to its original value;
#   - .L1st: first pass, tp = ap*bp[0] + np*m1 with m1 = tp[0]*n0;
#   - .Louter/.Linner: the same accumulation for each further bp[i], until
#     bp reaches the saved &bp[num];
#   - .Lsub: rp = tp - np with borrow propagation;
#   - selects tp or rp by masking with the final borrow, then .Lcopy writes
#     the selected words to rp while overwriting tp;
#   - restores %r6-%r15 and returns 1.
# "_dswap" is a pseudo-instruction rewritten by the post-processing loop at
# the end of the script (a 32-bit rotate in the 31-bit build, removed
# otherwise).
$code.=<<___;
	stm${g}	%r3,%r15,3*$SIZE_T($sp)

	lghi	$rp,-$stdframe-8	# leave room for carry bit
	lcgr	$j,$num		# -$num
	lgr	%r0,$sp
	la	$rp,0($rp,$sp)
	la	$sp,0($j,$rp)	# alloca
	st${g}	%r0,0($sp)	# back chain

	sra	$num,3		# restore $num
	la	$bp,0($j,$bp)	# restore $bp
	ahi	$num,-1		# adjust $num for inner loop
	lg	$n0,0($n0)	# pull n0
	_dswap	$n0

	lg	$bi,0($bp)
	_dswap	$bi
	lg	$alo,0($ap)
	_dswap	$alo
	mlgr	$ahi,$bi	# ap[0]*bp[0]
	lgr	$AHI,$ahi

	lgr	$mn0,$alo	# "tp[0]"*n0
	msgr	$mn0,$n0

	lg	$nlo,0($np)	#
	_dswap	$nlo
	mlgr	$nhi,$mn0	# np[0]*m1
	algr	$nlo,$alo	# +="tp[0]"
	lghi	$NHI,0
	alcgr	$NHI,$nhi

	la	$j,8(%r0)	# j=1
	lr	$count,$num

.align	16
.L1st:
	lg	$alo,0($j,$ap)
	_dswap	$alo
	mlgr	$ahi,$bi	# ap[j]*bp[0]
	algr	$alo,$AHI
	lghi	$AHI,0
	alcgr	$AHI,$ahi

	lg	$nlo,0($j,$np)
	_dswap	$nlo
	mlgr	$nhi,$mn0	# np[j]*m1
	algr	$nlo,$NHI
	lghi	$NHI,0
	alcgr	$nhi,$NHI	# +="tp[j]"
	algr	$nlo,$alo
	alcgr	$NHI,$nhi

	stg	$nlo,$stdframe-8($j,$sp)	# tp[j-1]=
	la	$j,8($j)	# j++
	brct	$count,.L1st

	algr	$NHI,$AHI
	lghi	$AHI,0
	alcgr	$AHI,$AHI	# upmost overflow bit
	stg	$NHI,$stdframe-8($j,$sp)
	stg	$AHI,$stdframe($j,$sp)
	la	$bp,8($bp)	# bp++

.Louter:
	lg	$bi,0($bp)	# bp[i]
	_dswap	$bi
	lg	$alo,0($ap)
	_dswap	$alo
	mlgr	$ahi,$bi	# ap[0]*bp[i]
	alg	$alo,$stdframe($sp)	# +=tp[0]
	lghi	$AHI,0
	alcgr	$AHI,$ahi

	lgr	$mn0,$alo
	msgr	$mn0,$n0	# tp[0]*n0

	lg	$nlo,0($np)	# np[0]
	_dswap	$nlo
	mlgr	$nhi,$mn0	# np[0]*m1
	algr	$nlo,$alo	# +="tp[0]"
	lghi	$NHI,0
	alcgr	$NHI,$nhi

	la	$j,8(%r0)	# j=1
	lr	$count,$num

.align	16
.Linner:
	lg	$alo,0($j,$ap)
	_dswap	$alo
	mlgr	$ahi,$bi	# ap[j]*bp[i]
	algr	$alo,$AHI
	lghi	$AHI,0
	alcgr	$ahi,$AHI
	alg	$alo,$stdframe($j,$sp)# +=tp[j]
	alcgr	$AHI,$ahi

	lg	$nlo,0($j,$np)
	_dswap	$nlo
	mlgr	$nhi,$mn0	# np[j]*m1
	algr	$nlo,$NHI
	lghi	$NHI,0
	alcgr	$nhi,$NHI
	algr	$nlo,$alo	# +="tp[j]"
	alcgr	$NHI,$nhi

	stg	$nlo,$stdframe-8($j,$sp)	# tp[j-1]=
	la	$j,8($j)	# j++
	brct	$count,.Linner

	algr	$NHI,$AHI
	lghi	$AHI,0
	alcgr	$AHI,$AHI
	alg	$NHI,$stdframe($j,$sp)# accumulate previous upmost overflow bit
	lghi	$ahi,0
	alcgr	$AHI,$ahi	# new upmost overflow bit
	stg	$NHI,$stdframe-8($j,$sp)
	stg	$AHI,$stdframe($j,$sp)

	la	$bp,8($bp)	# bp++
	cl${g}	$bp,`$stdframe+8+4*$SIZE_T`($j,$sp)	# compare to &bp[num]
	jne	.Louter

	l${g}	$rp,`$stdframe+8+2*$SIZE_T`($j,$sp)	# reincarnate rp
	la	$ap,$stdframe($sp)
	ahi	$num,1		# restore $num, incidentally clears "borrow"

	la	$j,0(%r0)
	lr	$count,$num
.Lsub:	lg	$alo,0($j,$ap)
	lg	$nlo,0($j,$np)
	_dswap	$nlo
	slbgr	$alo,$nlo
	stg	$alo,0($j,$rp)
	la	$j,8($j)
	brct	$count,.Lsub
	lghi	$ahi,0
	slbgr	$AHI,$ahi	# handle upmost carry

	ngr	$ap,$AHI
	lghi	$np,-1
	xgr	$np,$AHI
	ngr	$np,$rp
	ogr	$ap,$np		# ap=borrow?tp:rp

	la	$j,0(%r0)
	lgr	$count,$num
.Lcopy:	lg	$alo,0($j,$ap)		# copy or in-place refresh
	_dswap	$alo
	stg	$j,$stdframe($j,$sp)	# zap tp
	stg	$alo,0($j,$rp)
	la	$j,8($j)
	brct	$count,.Lcopy

	la	%r1,`$stdframe+8+6*$SIZE_T`($j,$sp)
	lm${g}	%r6,%r15,0(%r1)
	lghi	%r2,1		# signal "processed"
	br	%r14
.size	bn_mul_mont,.-bn_mul_mont
.string	"Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___

# Post-process the accumulated assembly line by line and print it.
foreach (split("\n",$code)) {
	# Replace every `...` span with the value of the enclosed Perl
	# expression (frame offsets, shift counts).
	s/\`([^\`]*)\`/eval $1/ge;
	# Expand the _dswap pseudo-instruction: in the 31-bit build it becomes
	# a 32-bit rotate of the 64-bit register, which swaps its two words;
	# in the 64-bit build it is removed.
	s/_dswap\s+(%r[0-9]+)/$SIZE_T==4 ? sprintf("rllg\t%s,%s,32",$1,$1) : ""/e;
	print $_,"\n";
}
# Buffered write errors (full disk, closed pipe) only surface at close time,
# so a failure here means the generated file is incomplete.
close STDOUT or die "error closing STDOUT: $!";