1238384Sjkim.ident "s390x.S, version 1.1"
2238384Sjkim// ====================================================================
3238384Sjkim// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
4238384Sjkim// project.
5238384Sjkim//
6238384Sjkim// Rights for redistribution and usage in source and binary forms are
7238384Sjkim// granted according to the OpenSSL license. Warranty of any kind is
8238384Sjkim// disclaimed.
9238384Sjkim// ====================================================================
10238384Sjkim
11238384Sjkim.text
12238384Sjkim
13238384Sjkim#define zero	%r0
14238384Sjkim
// BN_ULONG bn_mul_add_words(BN_ULONG *rp,BN_ULONG *ap,int len,BN_ULONG w);
//
// rp[i] += ap[i]*w for 0 <= i < len; the word carried out of the last
// limb is returned (0 is returned when len <= 0).
//
// In:   %r2 = rp, %r3 = ap, %r4 = len (32-bit signed), %r5 = w
// Out:  %r2 = carry word
// Regs: %r1 = rp, %r2 = byte offset into both arrays, %r0 = constant 0,
//       %r6:%r7 and %r8:%r9 = alternating 128-bit products,
//       %r10 = len%4.  %r6-%r10 are saved at 48(%r15) and reloaded on exit.
//
// The carry bit in the condition code stays live from one limb to the
// next, so only instructions that leave the condition code alone
// (la, lg, stg, lgr, mlgr, brct, j) may sit between the add instructions.
.globl	bn_mul_add_words
.type	bn_mul_add_words,@function
.align	4
bn_mul_add_words:
	la	%r1,0(%r2)	// keep rp in r1 ...
	lghi	%r2,0		// ... so r2 can be offset and default result
	lghi	zero,0		// constant 0 for folding carries
	ltgfr	%r4,%r4		// sign-extend len and test it
	bler	%r14		// len <= 0: return 0

	stmg	%r6,%r10,48(%r15)
	lghi	%r8,0		// incoming high word = 0
	lghi	%r10,3
	nr	%r10,%r4	// r10 = len%4
	sra	%r4,2		// r4 = len/4, sets condition code
	jz	.Lmadd_tail	// fewer than 4 limbs; CC 0 also means no carry
	algr	zero,zero	// 0+0 leaves the carry bit clear

.Lmadd_quad:
	// limb 0: product in r6:r7, previous high word in r8
	lg	%r7,0(%r2,%r3)
	mlgr	%r6,%r5
	alcgr	%r7,%r8
	alcgr	%r6,zero
	alg	%r7,0(%r2,%r1)
	stg	%r7,0(%r2,%r1)

	// limb 1: product in r8:r9, previous high word in r6
	lg	%r9,8(%r2,%r3)
	mlgr	%r8,%r5
	alcgr	%r9,%r6
	alcgr	%r8,zero
	alg	%r9,8(%r2,%r1)
	stg	%r9,8(%r2,%r1)

	// limb 2: product in r6:r7, previous high word in r8
	lg	%r7,16(%r2,%r3)
	mlgr	%r6,%r5
	alcgr	%r7,%r8
	alcgr	%r6,zero
	alg	%r7,16(%r2,%r1)
	stg	%r7,16(%r2,%r1)

	// limb 3: product in r8:r9, previous high word in r6
	lg	%r9,24(%r2,%r3)
	mlgr	%r8,%r5
	alcgr	%r9,%r6
	alcgr	%r8,zero
	alg	%r9,24(%r2,%r1)
	stg	%r9,24(%r2,%r1)

	la	%r2,32(%r2)	// advance offset by four limbs
	brct	%r4,.Lmadd_quad

	la	%r10,1(%r10)	// bump and count down again: tests len%4
	brct	%r10,.Lmadd_tail	// against zero without touching the carry
	j	.Lmadd_done

.Lmadd_tail:
	// one limb per pass; high word is handed over in r8
	lg	%r7,0(%r2,%r3)
	mlgr	%r6,%r5
	alcgr	%r7,%r8
	alcgr	%r6,zero
	alg	%r7,0(%r2,%r1)
	stg	%r7,0(%r2,%r1)

	la	%r2,8(%r2)	// advance offset by one limb
	lgr	%r8,%r6		// high word becomes next carry-in
	brct	%r10,.Lmadd_tail

.Lmadd_done:
	alcgr	%r8,zero	// fold the pending carry bit into the high word
	lgr	%r2,%r8		// return value
	lmg	%r6,%r10,48(%r15)
	br	%r14
.size	bn_mul_add_words,.-bn_mul_add_words
89238384Sjkim
// BN_ULONG bn_mul_words(BN_ULONG *rp,BN_ULONG *ap,int len,BN_ULONG w);
//
// rp[i] = low word of (ap[i]*w + carry) for 0 <= i < len; the final
// carry word is returned (0 is returned when len <= 0).
//
// In:   %r2 = rp, %r3 = ap, %r4 = len (32-bit signed), %r5 = w
// Out:  %r2 = carry word
// Regs: %r1 = rp, %r2 = byte offset, %r0 = constant 0,
//       %r6:%r7 and %r8:%r9 = alternating 128-bit products,
//       %r10 = len%4.  %r6-%r10 are saved at 48(%r15) and reloaded on exit.
//
// Each alcgr adds the previous high word plus the carry bit left by the
// previous alcgr, so nothing between them may change the condition code.
.globl	bn_mul_words
.type	bn_mul_words,@function
.align	4
bn_mul_words:
	la	%r1,0(%r2)	// keep rp in r1 ...
	lghi	%r2,0		// ... so r2 can be offset and default result
	lghi	zero,0		// constant 0 for the final carry fold
	ltgfr	%r4,%r4		// sign-extend len and test it
	bler	%r14		// len <= 0: return 0

	stmg	%r6,%r10,48(%r15)
	lghi	%r8,0		// incoming high word = 0
	lghi	%r10,3
	nr	%r10,%r4	// r10 = len%4
	sra	%r4,2		// r4 = len/4, sets condition code
	jz	.Lmul_tail	// fewer than 4 limbs; CC 0 also means no carry
	algr	zero,zero	// 0+0 leaves the carry bit clear

.Lmul_quad:
	lg	%r7,0(%r2,%r3)	// limb 0 -> r6:r7
	mlgr	%r6,%r5
	alcgr	%r7,%r8
	stg	%r7,0(%r2,%r1)

	lg	%r9,8(%r2,%r3)	// limb 1 -> r8:r9
	mlgr	%r8,%r5
	alcgr	%r9,%r6
	stg	%r9,8(%r2,%r1)

	lg	%r7,16(%r2,%r3)	// limb 2 -> r6:r7
	mlgr	%r6,%r5
	alcgr	%r7,%r8
	stg	%r7,16(%r2,%r1)

	lg	%r9,24(%r2,%r3)	// limb 3 -> r8:r9
	mlgr	%r8,%r5
	alcgr	%r9,%r6
	stg	%r9,24(%r2,%r1)

	la	%r2,32(%r2)	// advance offset by four limbs
	brct	%r4,.Lmul_quad

	la	%r10,1(%r10)	// bump and count down again: tests len%4
	brct	%r10,.Lmul_tail	// against zero without touching the carry
	j	.Lmul_done

.Lmul_tail:
	lg	%r7,0(%r2,%r3)	// single limb -> r6:r7
	mlgr	%r6,%r5
	alcgr	%r7,%r8
	stg	%r7,0(%r2,%r1)

	la	%r2,8(%r2)	// advance offset by one limb
	lgr	%r8,%r6		// high word becomes next carry-in
	brct	%r10,.Lmul_tail

.Lmul_done:
	alcgr	%r8,zero	// fold the pending carry bit into the high word
	lgr	%r2,%r8		// return value
	lmg	%r6,%r10,48(%r15)
	br	%r14
.size	bn_mul_words,.-bn_mul_words
154238384Sjkim
// void bn_sqr_words(BN_ULONG *r2,BN_ULONG *r3,int r4)
//
// For each of the r4 input words at r3, stores its 128-bit square at r2
// as two words: low word first, high word second.  Does nothing when
// r4 <= 0.
//
// In:   %r2 = result (2 words per input word), %r3 = input, %r4 = count
// Regs: %r6:%r7 = product (saved at 48(%r15) and reloaded on exit),
//       %r1 = count/4, %r4 = count%4 in the tail loop.
// Both pointers are advanced in place.
.globl	bn_sqr_words
.type	bn_sqr_words,@function
.align	4
bn_sqr_words:
	ltgfr	%r4,%r4		// sign-extend the count and test it
	bler	%r14		// count <= 0: nothing to do

	stmg	%r6,%r7,48(%r15)
	srag	%r1,%r4,2	// r1 = count/4
	jz	.Lsqr_tail	// fewer than 4 words: r4 is already count%4

.Lsqr_quad:
	lg	%r7,0(%r3)
	mlgr	%r6,%r7		// r6:r7 = word 0 squared
	stg	%r6,8(%r2)
	stg	%r7,0(%r2)

	lg	%r7,8(%r3)
	mlgr	%r6,%r7		// r6:r7 = word 1 squared
	stg	%r6,24(%r2)
	stg	%r7,16(%r2)

	lg	%r7,16(%r3)
	mlgr	%r6,%r7		// r6:r7 = word 2 squared
	stg	%r6,40(%r2)
	stg	%r7,32(%r2)

	lg	%r7,24(%r3)
	mlgr	%r6,%r7		// r6:r7 = word 3 squared
	stg	%r6,56(%r2)
	stg	%r7,48(%r2)

	la	%r2,64(%r2)	// result advances by 8 words
	la	%r3,32(%r3)	// input advances by 4 words
	brct	%r1,.Lsqr_quad

	lghi	%r1,3
	nr	%r4,%r1		// r4 = count%4
	jz	.Lsqr_done

.Lsqr_tail:
	lg	%r7,0(%r3)
	mlgr	%r6,%r7		// r6:r7 = word squared
	stg	%r6,8(%r2)
	stg	%r7,0(%r2)

	la	%r2,16(%r2)
	la	%r3,8(%r3)
	brct	%r4,.Lsqr_tail

.Lsqr_done:
	lmg	%r6,%r7,48(%r15)
	br	%r14
.size	bn_sqr_words,.-bn_sqr_words
210238384Sjkim
// BN_ULONG bn_div_words(BN_ULONG h,BN_ULONG l,BN_ULONG d);
//
// Returns the quotient of the 128-bit value h:l divided by d.
//
// In:   %r2 = h, %r3 = l, %r4 = d
// Out:  %r2 = quotient
// The arguments already sit in the even/odd pair %r2:%r3 that dlgr takes
// as its dividend; dlgr leaves the remainder in %r2 and the quotient in
// %r3, so the quotient is copied over the remainder before returning.
// No check is made here for d == 0 or for a quotient wider than one word.
.globl	bn_div_words
.type	bn_div_words,@function
.align	4
bn_div_words:
	dlgr	%r2,%r4		// r3 = (r2:r3)/r4, r2 = (r2:r3)%r4
	lgr	%r2,%r3		// hand back the quotient
	br	%r14
.size	bn_div_words,.-bn_div_words
220238384Sjkim
// BN_ULONG bn_add_words(BN_ULONG *rp,BN_ULONG *ap,BN_ULONG *bp,int len);
//
// rp[i] = ap[i] + bp[i] + carry for 0 <= i < len; returns the final
// carry (0 or 1).  Returns 0 when len <= 0.
//
// In:   %r2 = rp, %r3 = ap, %r4 = bp, %r5 = len (32-bit signed)
// Out:  %r2 = carry
// Regs: %r1 = rp, %r2 = byte offset, %r0 = scratch word,
//       %r6 = len%4 (saved at 48(%r15) and reloaded on exit).
//
// The carry lives in the condition code across the whole loop, so the
// loop bookkeeping only uses la and brct, which leave it untouched.
.globl	bn_add_words
.type	bn_add_words,@function
.align	4
bn_add_words:
	la	%r1,0(%r2)	// keep rp in r1
	lghi	%r2,0		// offset = 0, also the early-return value
	ltgfr	%r5,%r5		// sign-extend len and test it
	bler	%r14		// len <= 0: return 0

	stg	%r6,48(%r15)
	lghi	%r6,3
	nr	%r6,%r5		// r6 = len%4
	sra	%r5,2		// r5 = len/4; sra is used because it sets CC
	jz	.Ladd_tail	// fewer than 4 limbs; CC 0 also means no carry
	algr	%r2,%r2		// 0+0: clears the carry, r2 stays 0

.Ladd_quad:
	lg	%r0,0(%r2,%r3)
	alcg	%r0,0(%r2,%r4)
	stg	%r0,0(%r2,%r1)

	lg	%r0,8(%r2,%r3)
	alcg	%r0,8(%r2,%r4)
	stg	%r0,8(%r2,%r1)

	lg	%r0,16(%r2,%r3)
	alcg	%r0,16(%r2,%r4)
	stg	%r0,16(%r2,%r1)

	lg	%r0,24(%r2,%r3)
	alcg	%r0,24(%r2,%r4)
	stg	%r0,24(%r2,%r1)

	la	%r2,32(%r2)	// advance offset by four limbs
	brct	%r5,.Ladd_quad

	la	%r6,1(%r6)	// bump and count down again: tests len%4
	brct	%r6,.Ladd_tail	// against zero without touching the carry
	j	.Ladd_done

.Ladd_tail:
	lg	%r0,0(%r2,%r3)
	alcg	%r0,0(%r2,%r4)
	stg	%r0,0(%r2,%r1)

	la	%r2,8(%r2)	// advance offset by one limb
	brct	%r6,.Ladd_tail

.Ladd_done:
	lg	%r6,48(%r15)
	lghi	%r2,0
	alcgr	%r2,%r2		// 0+0+carry: turn the carry bit into 0 or 1
	br	%r14
.size	bn_add_words,.-bn_add_words
274238384Sjkim
// BN_ULONG bn_sub_words(BN_ULONG *rp,BN_ULONG *ap,BN_ULONG *bp,int len);
//
// rp[i] = ap[i] - bp[i] - borrow for 0 <= i < len; returns the final
// borrow (0 or 1).  Returns 0 when len <= 0.
//
// In:   %r2 = rp, %r3 = ap, %r4 = bp, %r5 = len (32-bit signed)
// Out:  %r2 = borrow
// Regs: %r1 = rp, %r2 = byte offset, %r0 = scratch word,
//       %r6 = len%4 (saved at 48(%r15) and reloaded on exit).
//
// The borrow lives in the condition code across the whole loop, so the
// loop bookkeeping only uses la and brct, which leave it untouched.
.globl	bn_sub_words
.type	bn_sub_words,@function
.align	4
bn_sub_words:
	la	%r1,0(%r2)	// keep rp in r1
	lghi	%r2,0		// offset = 0, also the early-return value
	ltgfr	%r5,%r5		// sign-extend len and test it
	bler	%r14		// len <= 0: return 0

	stg	%r6,48(%r15)
	lghi	%r6,3
	nr	%r6,%r5		// r6 = len%4
	sra	%r5,2		// r5 = len/4; sra is used because it sets CC
	jz	.Lsub_short	// fewer than 4 limbs: borrow must be cleared first
				// otherwise the CC left by sra already reads
				// as "no borrow"

.Lsub_quad:
	lg	%r0,0(%r2,%r3)
	slbg	%r0,0(%r2,%r4)
	stg	%r0,0(%r2,%r1)

	lg	%r0,8(%r2,%r3)
	slbg	%r0,8(%r2,%r4)
	stg	%r0,8(%r2,%r1)

	lg	%r0,16(%r2,%r3)
	slbg	%r0,16(%r2,%r4)
	stg	%r0,16(%r2,%r1)

	lg	%r0,24(%r2,%r3)
	slbg	%r0,24(%r2,%r4)
	stg	%r0,24(%r2,%r1)

	la	%r2,32(%r2)	// advance offset by four limbs
	brct	%r5,.Lsub_quad

	la	%r6,1(%r6)	// bump and count down again: tests len%4
	brct	%r6,.Lsub_tail	// against zero without touching the borrow

.Lsub_done:
	lg	%r6,48(%r15)
	lghi	%r2,0
	slbgr	%r2,%r2		// 0-0-borrow: 0 or -1
	lcgr	%r2,%r2		// negate: 0 or 1
	br	%r14

.Lsub_short:
	slgr	%r2,%r2		// 0-0: clears the borrow, r2 stays 0
.Lsub_tail:
	lg	%r0,0(%r2,%r3)
	slbg	%r0,0(%r2,%r4)
	stg	%r0,0(%r2,%r1)

	la	%r2,8(%r2)	// advance offset by one limb
	brct	%r6,.Lsub_tail
	j	.Lsub_done
.size	bn_sub_words,.-bn_sub_words
328238384Sjkim
// Three accumulator words used by the comba routines below.  Their roles
// (low / middle / top) rotate from one result column to the next.
#define c1	%r1
#define c2	%r5
#define c3	%r8

// mul_add_c(ai,bi,lo,mid,top):  top:mid:lo += a[ai] * b[bi]
// a is addressed through %r3 and b through %r4.  The 128-bit product is
// formed in %r6:%r7 (both clobbered); its low word is added to lo, its
// high word plus carry to mid, and the remaining carry to top.
// The zero alias must hold 0 when the macro runs.
#define mul_add_c(ai,bi,lo,mid,top)	\
	lg	%r7,ai*8(%r3);		\
	mlg	%r6,bi*8(%r4);		\
	algr	lo,%r7;			\
	alcgr	mid,%r6;		\
	alcgr	top,zero
339238384Sjkim
// void bn_mul_comba8(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
//
// Column-wise product of two 8-word numbers: the 16-word result is
// written to r2 from the 8-word operands at r3 and r4.
//
// For every result column k, all products a[i]*b[j] with i+j == k are
// summed into a three-word accumulator.  The lowest accumulator word is
// then stored as r[k] and cleared, and the three registers c1/c2/c3
// rotate roles for the next column.
//
// %r6-%r8 are saved at 48(%r15) and reloaded on exit (%r6:%r7 hold each
// product, %r8 is c3).
.globl	bn_mul_comba8
.type	bn_mul_comba8,@function
.align	4
bn_mul_comba8:
	stmg	%r6,%r8,48(%r15)

	lghi	zero,0
	lghi	c1,0
	lghi	c2,0
	lghi	c3,0

	// column 0
	mul_add_c(0,0,c1,c2,c3);
	stg	c1,0*8(%r2)
	lghi	c1,0

	// column 1
	mul_add_c(0,1,c2,c3,c1);
	mul_add_c(1,0,c2,c3,c1);
	stg	c2,1*8(%r2)
	lghi	c2,0

	// column 2
	mul_add_c(0,2,c3,c1,c2);
	mul_add_c(1,1,c3,c1,c2);
	mul_add_c(2,0,c3,c1,c2);
	stg	c3,2*8(%r2)
	lghi	c3,0

	// column 3
	mul_add_c(0,3,c1,c2,c3);
	mul_add_c(1,2,c1,c2,c3);
	mul_add_c(2,1,c1,c2,c3);
	mul_add_c(3,0,c1,c2,c3);
	stg	c1,3*8(%r2)
	lghi	c1,0

	// column 4
	mul_add_c(0,4,c2,c3,c1);
	mul_add_c(1,3,c2,c3,c1);
	mul_add_c(2,2,c2,c3,c1);
	mul_add_c(3,1,c2,c3,c1);
	mul_add_c(4,0,c2,c3,c1);
	stg	c2,4*8(%r2)
	lghi	c2,0

	// column 5
	mul_add_c(0,5,c3,c1,c2);
	mul_add_c(1,4,c3,c1,c2);
	mul_add_c(2,3,c3,c1,c2);
	mul_add_c(3,2,c3,c1,c2);
	mul_add_c(4,1,c3,c1,c2);
	mul_add_c(5,0,c3,c1,c2);
	stg	c3,5*8(%r2)
	lghi	c3,0

	// column 6
	mul_add_c(0,6,c1,c2,c3);
	mul_add_c(1,5,c1,c2,c3);
	mul_add_c(2,4,c1,c2,c3);
	mul_add_c(3,3,c1,c2,c3);
	mul_add_c(4,2,c1,c2,c3);
	mul_add_c(5,1,c1,c2,c3);
	mul_add_c(6,0,c1,c2,c3);
	stg	c1,6*8(%r2)
	lghi	c1,0

	// column 7
	mul_add_c(0,7,c2,c3,c1);
	mul_add_c(1,6,c2,c3,c1);
	mul_add_c(2,5,c2,c3,c1);
	mul_add_c(3,4,c2,c3,c1);
	mul_add_c(4,3,c2,c3,c1);
	mul_add_c(5,2,c2,c3,c1);
	mul_add_c(6,1,c2,c3,c1);
	mul_add_c(7,0,c2,c3,c1);
	stg	c2,7*8(%r2)
	lghi	c2,0

	// column 8
	mul_add_c(1,7,c3,c1,c2);
	mul_add_c(2,6,c3,c1,c2);
	mul_add_c(3,5,c3,c1,c2);
	mul_add_c(4,4,c3,c1,c2);
	mul_add_c(5,3,c3,c1,c2);
	mul_add_c(6,2,c3,c1,c2);
	mul_add_c(7,1,c3,c1,c2);
	stg	c3,8*8(%r2)
	lghi	c3,0

	// column 9
	mul_add_c(2,7,c1,c2,c3);
	mul_add_c(3,6,c1,c2,c3);
	mul_add_c(4,5,c1,c2,c3);
	mul_add_c(5,4,c1,c2,c3);
	mul_add_c(6,3,c1,c2,c3);
	mul_add_c(7,2,c1,c2,c3);
	stg	c1,9*8(%r2)
	lghi	c1,0

	// column 10
	mul_add_c(3,7,c2,c3,c1);
	mul_add_c(4,6,c2,c3,c1);
	mul_add_c(5,5,c2,c3,c1);
	mul_add_c(6,4,c2,c3,c1);
	mul_add_c(7,3,c2,c3,c1);
	stg	c2,10*8(%r2)
	lghi	c2,0

	// column 11
	mul_add_c(4,7,c3,c1,c2);
	mul_add_c(5,6,c3,c1,c2);
	mul_add_c(6,5,c3,c1,c2);
	mul_add_c(7,4,c3,c1,c2);
	stg	c3,11*8(%r2)
	lghi	c3,0

	// column 12
	mul_add_c(5,7,c1,c2,c3);
	mul_add_c(6,6,c1,c2,c3);
	mul_add_c(7,5,c1,c2,c3);
	stg	c1,12*8(%r2)
	lghi	c1,0

	// column 13
	mul_add_c(6,7,c2,c3,c1);
	mul_add_c(7,6,c2,c3,c1);
	stg	c2,13*8(%r2)
	lghi	c2,0

	// column 14; the word above it becomes r[15]
	mul_add_c(7,7,c3,c1,c2);
	stg	c3,14*8(%r2)
	stg	c1,15*8(%r2)

	lmg	%r6,%r8,48(%r15)
	br	%r14
.size	bn_mul_comba8,.-bn_mul_comba8
465238384Sjkim
// void bn_mul_comba4(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
//
// Column-wise product of two 4-word numbers: the 8-word result is
// written to r2 from the 4-word operands at r3 and r4.
//
// For every result column k, all products a[i]*b[j] with i+j == k are
// summed into a three-word accumulator.  The lowest accumulator word is
// then stored as r[k] and cleared, and the three registers c1/c2/c3
// rotate roles for the next column.
//
// In:   %r2 = result (8 words), %r3 = a (4 words), %r4 = b (4 words)
// Regs: %r6:%r7 hold each product and %r8 is c3, so %r6-%r8 are saved at
//       48(%r15) on entry and must be reloaded from there before return.
.globl	bn_mul_comba4
.type	bn_mul_comba4,@function
.align	4
bn_mul_comba4:
	stmg	%r6,%r8,48(%r15)

	lghi	c1,0
	lghi	c2,0
	lghi	c3,0
	lghi	zero,0

	// column 0
	mul_add_c(0,0,c1,c2,c3);
	stg	c1,0*8(%r2)	// r[0] goes through the result pointer; %r3
				// is the a operand, still read by later columns
	lghi	c1,0

	// column 1
	mul_add_c(0,1,c2,c3,c1);
	mul_add_c(1,0,c2,c3,c1);
	stg	c2,1*8(%r2)
	lghi	c2,0

	// column 2
	mul_add_c(2,0,c3,c1,c2);
	mul_add_c(1,1,c3,c1,c2);
	mul_add_c(0,2,c3,c1,c2);
	stg	c3,2*8(%r2)
	lghi	c3,0

	// column 3
	mul_add_c(0,3,c1,c2,c3);
	mul_add_c(1,2,c1,c2,c3);
	mul_add_c(2,1,c1,c2,c3);
	mul_add_c(3,0,c1,c2,c3);
	stg	c1,3*8(%r2)
	lghi	c1,0

	// column 4
	mul_add_c(3,1,c2,c3,c1);
	mul_add_c(2,2,c2,c3,c1);
	mul_add_c(1,3,c2,c3,c1);
	stg	c2,4*8(%r2)
	lghi	c2,0

	// column 5
	mul_add_c(2,3,c3,c1,c2);
	mul_add_c(3,2,c3,c1,c2);
	stg	c3,5*8(%r2)
	lghi	c3,0

	// column 6; the word above it becomes r[7]
	mul_add_c(3,3,c1,c2,c3);
	stg	c1,6*8(%r2)
	stg	c2,7*8(%r2)

	lmg	%r6,%r8,48(%r15)	// reload the registers saved on entry
	br	%r14
.size	bn_mul_comba4,.-bn_mul_comba4
518238384Sjkim
// sqr_add_c(ai,lo,mid,top):  top:mid:lo += a[ai] * a[ai]
// a is addressed through %r3.  The 128-bit square is formed in %r6:%r7
// (both clobbered).  The zero alias must hold 0 when the macro runs.
#define sqr_add_c(ai,lo,mid,top)	\
	lg	%r7,ai*8(%r3);		\
	mlgr	%r6,%r7;		\
	algr	lo,%r7;			\
	alcgr	mid,%r6;		\
	alcgr	top,zero

// sqr_add_c2(ai,aj,lo,mid,top):  top:mid:lo += 2 * a[ai] * a[aj]
// The product is computed once in %r6:%r7 and accumulated twice, so the
// doubling never has to shift a 128-bit value.
#define sqr_add_c2(ai,aj,lo,mid,top)	\
	lg	%r7,ai*8(%r3);		\
	mlg	%r6,aj*8(%r3);		\
	algr	lo,%r7;			\
	alcgr	mid,%r6;		\
	alcgr	top,zero;		\
	algr	lo,%r7;			\
	alcgr	mid,%r6;		\
	alcgr	top,zero
535238384Sjkim
// void bn_sqr_comba8(BN_ULONG *r2,BN_ULONG *r3);
//
// Column-wise square of an 8-word number: the 16-word result is written
// to r2 from the 8-word operand at r3.
//
// For every result column k, the doubled cross products 2*a[i]*a[j]
// (i > j, i+j == k) and, for even k, the square a[k/2]*a[k/2] are summed
// into a three-word accumulator.  The lowest word is stored as r[k] and
// cleared, then c1/c2/c3 rotate roles for the next column.
//
// %r6-%r8 are saved at 48(%r15) and reloaded on exit.
.globl	bn_sqr_comba8
.type	bn_sqr_comba8,@function
.align	4
bn_sqr_comba8:
	stmg	%r6,%r8,48(%r15)

	lghi	zero,0
	lghi	c1,0
	lghi	c2,0
	lghi	c3,0

	// column 0
	sqr_add_c(0,c1,c2,c3);
	stg	c1,0*8(%r2)
	lghi	c1,0

	// column 1
	sqr_add_c2(1,0,c2,c3,c1);
	stg	c2,1*8(%r2)
	lghi	c2,0

	// column 2
	sqr_add_c2(2,0,c3,c1,c2);
	sqr_add_c(1,c3,c1,c2);
	stg	c3,2*8(%r2)
	lghi	c3,0

	// column 3
	sqr_add_c2(3,0,c1,c2,c3);
	sqr_add_c2(2,1,c1,c2,c3);
	stg	c1,3*8(%r2)
	lghi	c1,0

	// column 4
	sqr_add_c2(4,0,c2,c3,c1);
	sqr_add_c2(3,1,c2,c3,c1);
	sqr_add_c(2,c2,c3,c1);
	stg	c2,4*8(%r2)
	lghi	c2,0

	// column 5
	sqr_add_c2(5,0,c3,c1,c2);
	sqr_add_c2(4,1,c3,c1,c2);
	sqr_add_c2(3,2,c3,c1,c2);
	stg	c3,5*8(%r2)
	lghi	c3,0

	// column 6
	sqr_add_c2(6,0,c1,c2,c3);
	sqr_add_c2(5,1,c1,c2,c3);
	sqr_add_c2(4,2,c1,c2,c3);
	sqr_add_c(3,c1,c2,c3);
	stg	c1,6*8(%r2)
	lghi	c1,0

	// column 7
	sqr_add_c2(7,0,c2,c3,c1);
	sqr_add_c2(6,1,c2,c3,c1);
	sqr_add_c2(5,2,c2,c3,c1);
	sqr_add_c2(4,3,c2,c3,c1);
	stg	c2,7*8(%r2)
	lghi	c2,0

	// column 8
	sqr_add_c2(7,1,c3,c1,c2);
	sqr_add_c2(6,2,c3,c1,c2);
	sqr_add_c2(5,3,c3,c1,c2);
	sqr_add_c(4,c3,c1,c2);
	stg	c3,8*8(%r2)
	lghi	c3,0

	// column 9
	sqr_add_c2(7,2,c1,c2,c3);
	sqr_add_c2(6,3,c1,c2,c3);
	sqr_add_c2(5,4,c1,c2,c3);
	stg	c1,9*8(%r2)
	lghi	c1,0

	// column 10
	sqr_add_c2(7,3,c2,c3,c1);
	sqr_add_c2(6,4,c2,c3,c1);
	sqr_add_c(5,c2,c3,c1);
	stg	c2,10*8(%r2)
	lghi	c2,0

	// column 11
	sqr_add_c2(7,4,c3,c1,c2);
	sqr_add_c2(6,5,c3,c1,c2);
	stg	c3,11*8(%r2)
	lghi	c3,0

	// column 12
	sqr_add_c2(7,5,c1,c2,c3);
	sqr_add_c(6,c1,c2,c3);
	stg	c1,12*8(%r2)
	lghi	c1,0

	// column 13
	sqr_add_c2(7,6,c2,c3,c1);
	stg	c2,13*8(%r2)
	lghi	c2,0

	// column 14; the word above it becomes r[15]
	sqr_add_c(7,c3,c1,c2);
	stg	c3,14*8(%r2)
	stg	c1,15*8(%r2)

	lmg	%r6,%r8,48(%r15)
	br	%r14
.size	bn_sqr_comba8,.-bn_sqr_comba8
632238384Sjkim
// void bn_sqr_comba4(BN_ULONG *r2,BN_ULONG *r3);
//
// Column-wise square of a 4-word number: the 8-word result is written
// to r2 from the 4-word operand at r3.
//
// For every result column k, the doubled cross products 2*a[i]*a[j]
// (i > j, i+j == k) and, for even k, the square a[k/2]*a[k/2] are summed
// into a three-word accumulator.  The lowest word is stored as r[k] and
// cleared, then c1/c2/c3 rotate roles for the next column.
//
// %r6-%r8 are saved at 48(%r15) and reloaded on exit.
.globl	bn_sqr_comba4
.type	bn_sqr_comba4,@function
.align	4
bn_sqr_comba4:
	stmg	%r6,%r8,48(%r15)

	lghi	zero,0
	lghi	c1,0
	lghi	c2,0
	lghi	c3,0

	// column 0
	sqr_add_c(0,c1,c2,c3);
	stg	c1,0*8(%r2)
	lghi	c1,0

	// column 1
	sqr_add_c2(1,0,c2,c3,c1);
	stg	c2,1*8(%r2)
	lghi	c2,0

	// column 2
	sqr_add_c2(2,0,c3,c1,c2);
	sqr_add_c(1,c3,c1,c2);
	stg	c3,2*8(%r2)
	lghi	c3,0

	// column 3
	sqr_add_c2(3,0,c1,c2,c3);
	sqr_add_c2(2,1,c1,c2,c3);
	stg	c1,3*8(%r2)
	lghi	c1,0

	// column 4
	sqr_add_c2(3,1,c2,c3,c1);
	sqr_add_c(2,c2,c3,c1);
	stg	c2,4*8(%r2)
	lghi	c2,0

	// column 5
	sqr_add_c2(3,2,c3,c1,c2);
	stg	c3,5*8(%r2)
	lghi	c3,0

	// column 6; the word above it becomes r[7]
	sqr_add_c(3,c1,c2,c3);
	stg	c1,6*8(%r2)
	stg	c2,7*8(%r2)

	lmg	%r6,%r8,48(%r15)
	br	%r14
.size	bn_sqr_comba4,.-bn_sqr_comba4
679