1238384Sjkim#!/usr/bin/env perl
2238384Sjkim
3238384Sjkim# ====================================================================
4238384Sjkim# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5238384Sjkim# project. The module is, however, dual licensed under OpenSSL and
6238384Sjkim# CRYPTOGAMS licenses depending on where you obtain it. For further
7238384Sjkim# details see http://www.openssl.org/~appro/cryptogams/.
8238384Sjkim# ====================================================================
9238384Sjkim
10238384Sjkim# I let hardware handle unaligned input(*), except on page boundaries
11238384Sjkim# (see below for details). Otherwise straightforward implementation
12238384Sjkim# with X vector in register bank. The module is big-endian [which is
13238384Sjkim# not big deal as there're no little-endian targets left around].
14238384Sjkim#
15238384Sjkim# (*) this means that this module is inappropriate for PPC403? Does
16238384Sjkim#     anybody know if pre-POWER3 can sustain unaligned load?
17238384Sjkim
18238384Sjkim# 			-m64	-m32
19238384Sjkim# ----------------------------------
20238384Sjkim# PPC970,gcc-4.0.0	+76%	+59%
21238384Sjkim# Power6,xlc-7		+68%	+33%
22238384Sjkim
23238384Sjkim$flavour = shift;
24238384Sjkim
25238384Sjkimif ($flavour =~ /64/) {
26238384Sjkim	$SIZE_T	=8;
27238384Sjkim	$LRSAVE	=2*$SIZE_T;
28238384Sjkim	$UCMP	="cmpld";
29238384Sjkim	$STU	="stdu";
30238384Sjkim	$POP	="ld";
31238384Sjkim	$PUSH	="std";
32238384Sjkim} elsif ($flavour =~ /32/) {
33238384Sjkim	$SIZE_T	=4;
34238384Sjkim	$LRSAVE	=$SIZE_T;
35238384Sjkim	$UCMP	="cmplw";
36238384Sjkim	$STU	="stwu";
37238384Sjkim	$POP	="lwz";
38238384Sjkim	$PUSH	="stw";
39238384Sjkim} else { die "nonsense $flavour"; }
40238384Sjkim
41238384Sjkim$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
42238384Sjkim( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
43238384Sjkim( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
44238384Sjkimdie "can't locate ppc-xlate.pl";
45238384Sjkim
46238384Sjkimopen STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
47238384Sjkim
48238384Sjkim$FRAME=24*$SIZE_T+64;
49238384Sjkim$LOCALS=6*$SIZE_T;
50238384Sjkim
51238384Sjkim$K  ="r0";
52238384Sjkim$sp ="r1";
53238384Sjkim$toc="r2";
54238384Sjkim$ctx="r3";
55238384Sjkim$inp="r4";
56238384Sjkim$num="r5";
57238384Sjkim$t0 ="r15";
58238384Sjkim$t1 ="r6";
59238384Sjkim
60238384Sjkim$A  ="r7";
61238384Sjkim$B  ="r8";
62238384Sjkim$C  ="r9";
63238384Sjkim$D  ="r10";
64238384Sjkim$E  ="r11";
65238384Sjkim$T  ="r12";
66238384Sjkim
67238384Sjkim@V=($A,$B,$C,$D,$E,$T);
68238384Sjkim@X=("r16","r17","r18","r19","r20","r21","r22","r23",
69238384Sjkim    "r24","r25","r26","r27","r28","r29","r30","r31");
70238384Sjkim
71238384Sjkimsub BODY_00_19 {
72238384Sjkimmy ($i,$a,$b,$c,$d,$e,$f)=@_;
73238384Sjkimmy $j=$i+1;
74238384Sjkim$code.=<<___ if ($i==0);
75238384Sjkim	lwz	@X[$i],`$i*4`($inp)
76238384Sjkim___
77238384Sjkim$code.=<<___ if ($i<15);
78238384Sjkim	lwz	@X[$j],`$j*4`($inp)
79238384Sjkim	add	$f,$K,$e
80238384Sjkim	rotlwi	$e,$a,5
81238384Sjkim	add	$f,$f,@X[$i]
82238384Sjkim	and	$t0,$c,$b
83238384Sjkim	add	$f,$f,$e
84238384Sjkim	andc	$t1,$d,$b
85238384Sjkim	rotlwi	$b,$b,30
86238384Sjkim	or	$t0,$t0,$t1
87238384Sjkim	add	$f,$f,$t0
88238384Sjkim___
89238384Sjkim$code.=<<___ if ($i>=15);
90238384Sjkim	add	$f,$K,$e
91238384Sjkim	rotlwi	$e,$a,5
92238384Sjkim	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
93238384Sjkim	add	$f,$f,@X[$i%16]
94238384Sjkim	and	$t0,$c,$b
95238384Sjkim	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
96238384Sjkim	add	$f,$f,$e
97238384Sjkim	andc	$t1,$d,$b
98238384Sjkim	rotlwi	$b,$b,30
99238384Sjkim	or	$t0,$t0,$t1
100238384Sjkim	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
101238384Sjkim	add	$f,$f,$t0
102238384Sjkim	rotlwi	@X[$j%16],@X[$j%16],1
103238384Sjkim___
104238384Sjkim}
105238384Sjkim
106238384Sjkimsub BODY_20_39 {
107238384Sjkimmy ($i,$a,$b,$c,$d,$e,$f)=@_;
108238384Sjkimmy $j=$i+1;
109238384Sjkim$code.=<<___ if ($i<79);
110238384Sjkim	add	$f,$K,$e
111238384Sjkim	rotlwi	$e,$a,5
112238384Sjkim	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
113238384Sjkim	add	$f,$f,@X[$i%16]
114238384Sjkim	xor	$t0,$b,$c
115238384Sjkim	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
116238384Sjkim	add	$f,$f,$e
117238384Sjkim	rotlwi	$b,$b,30
118238384Sjkim	xor	$t0,$t0,$d
119238384Sjkim	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
120238384Sjkim	add	$f,$f,$t0
121238384Sjkim	rotlwi	@X[$j%16],@X[$j%16],1
122238384Sjkim___
123238384Sjkim$code.=<<___ if ($i==79);
124238384Sjkim	add	$f,$K,$e
125238384Sjkim	rotlwi	$e,$a,5
126238384Sjkim	lwz	r16,0($ctx)
127238384Sjkim	add	$f,$f,@X[$i%16]
128238384Sjkim	xor	$t0,$b,$c
129238384Sjkim	lwz	r17,4($ctx)
130238384Sjkim	add	$f,$f,$e
131238384Sjkim	rotlwi	$b,$b,30
132238384Sjkim	lwz	r18,8($ctx)
133238384Sjkim	xor	$t0,$t0,$d
134238384Sjkim	lwz	r19,12($ctx)
135238384Sjkim	add	$f,$f,$t0
136238384Sjkim	lwz	r20,16($ctx)
137238384Sjkim___
138238384Sjkim}
139238384Sjkim
140238384Sjkimsub BODY_40_59 {
141238384Sjkimmy ($i,$a,$b,$c,$d,$e,$f)=@_;
142238384Sjkimmy $j=$i+1;
143238384Sjkim$code.=<<___;
144238384Sjkim	add	$f,$K,$e
145238384Sjkim	rotlwi	$e,$a,5
146238384Sjkim	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
147238384Sjkim	add	$f,$f,@X[$i%16]
148238384Sjkim	and	$t0,$b,$c
149238384Sjkim	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
150238384Sjkim	add	$f,$f,$e
151238384Sjkim	or	$t1,$b,$c
152238384Sjkim	rotlwi	$b,$b,30
153238384Sjkim	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
154238384Sjkim	and	$t1,$t1,$d
155238384Sjkim	or	$t0,$t0,$t1
156238384Sjkim	rotlwi	@X[$j%16],@X[$j%16],1
157238384Sjkim	add	$f,$f,$t0
158238384Sjkim___
159238384Sjkim}
160238384Sjkim
161238384Sjkim$code=<<___;
162238384Sjkim.machine	"any"
163238384Sjkim.text
164238384Sjkim
165238384Sjkim.globl	.sha1_block_data_order
166238384Sjkim.align	4
167238384Sjkim.sha1_block_data_order:
168238384Sjkim	$STU	$sp,-$FRAME($sp)
169238384Sjkim	mflr	r0
170238384Sjkim	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
171238384Sjkim	$PUSH	r16,`$FRAME-$SIZE_T*16`($sp)
172238384Sjkim	$PUSH	r17,`$FRAME-$SIZE_T*15`($sp)
173238384Sjkim	$PUSH	r18,`$FRAME-$SIZE_T*14`($sp)
174238384Sjkim	$PUSH	r19,`$FRAME-$SIZE_T*13`($sp)
175238384Sjkim	$PUSH	r20,`$FRAME-$SIZE_T*12`($sp)
176238384Sjkim	$PUSH	r21,`$FRAME-$SIZE_T*11`($sp)
177238384Sjkim	$PUSH	r22,`$FRAME-$SIZE_T*10`($sp)
178238384Sjkim	$PUSH	r23,`$FRAME-$SIZE_T*9`($sp)
179238384Sjkim	$PUSH	r24,`$FRAME-$SIZE_T*8`($sp)
180238384Sjkim	$PUSH	r25,`$FRAME-$SIZE_T*7`($sp)
181238384Sjkim	$PUSH	r26,`$FRAME-$SIZE_T*6`($sp)
182238384Sjkim	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
183238384Sjkim	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
184238384Sjkim	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
185238384Sjkim	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
186238384Sjkim	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
187238384Sjkim	$PUSH	r0,`$FRAME+$LRSAVE`($sp)
188238384Sjkim	lwz	$A,0($ctx)
189238384Sjkim	lwz	$B,4($ctx)
190238384Sjkim	lwz	$C,8($ctx)
191238384Sjkim	lwz	$D,12($ctx)
192238384Sjkim	lwz	$E,16($ctx)
193238384Sjkim	andi.	r0,$inp,3
194238384Sjkim	bne	Lunaligned
195238384SjkimLaligned:
196238384Sjkim	mtctr	$num
197238384Sjkim	bl	Lsha1_block_private
198238384Sjkim	b	Ldone
199238384Sjkim
200238384Sjkim; PowerPC specification allows an implementation to be ill-behaved
201238384Sjkim; upon unaligned access which crosses page boundary. "Better safe
202238384Sjkim; than sorry" principle makes me treat it specially. But I don't
203238384Sjkim; look for particular offending word, but rather for 64-byte input
204238384Sjkim; block which crosses the boundary. Once found that block is aligned
205238384Sjkim; and hashed separately...
206238384Sjkim.align	4
207238384SjkimLunaligned:
208238384Sjkim	subfic	$t1,$inp,4096
209238384Sjkim	andi.	$t1,$t1,4095	; distance to closest page boundary
210238384Sjkim	srwi.	$t1,$t1,6	; t1/=64
211238384Sjkim	beq	Lcross_page
212238384Sjkim	$UCMP	$num,$t1
213238384Sjkim	ble-	Laligned	; didn't cross the page boundary
214238384Sjkim	mtctr	$t1
215238384Sjkim	subfc	$num,$t1,$num
216238384Sjkim	bl	Lsha1_block_private
217238384SjkimLcross_page:
218238384Sjkim	li	$t1,16
219238384Sjkim	mtctr	$t1
220238384Sjkim	addi	r20,$sp,$LOCALS	; spot within the frame
221238384SjkimLmemcpy:
222238384Sjkim	lbz	r16,0($inp)
223238384Sjkim	lbz	r17,1($inp)
224238384Sjkim	lbz	r18,2($inp)
225238384Sjkim	lbz	r19,3($inp)
226238384Sjkim	addi	$inp,$inp,4
227238384Sjkim	stb	r16,0(r20)
228238384Sjkim	stb	r17,1(r20)
229238384Sjkim	stb	r18,2(r20)
230238384Sjkim	stb	r19,3(r20)
231238384Sjkim	addi	r20,r20,4
232238384Sjkim	bdnz	Lmemcpy
233238384Sjkim
234238384Sjkim	$PUSH	$inp,`$FRAME-$SIZE_T*18`($sp)
235238384Sjkim	li	$t1,1
236238384Sjkim	addi	$inp,$sp,$LOCALS
237238384Sjkim	mtctr	$t1
238238384Sjkim	bl	Lsha1_block_private
239238384Sjkim	$POP	$inp,`$FRAME-$SIZE_T*18`($sp)
240238384Sjkim	addic.	$num,$num,-1
241238384Sjkim	bne-	Lunaligned
242238384Sjkim
243238384SjkimLdone:
244238384Sjkim	$POP	r0,`$FRAME+$LRSAVE`($sp)
245238384Sjkim	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
246238384Sjkim	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
247238384Sjkim	$POP	r17,`$FRAME-$SIZE_T*15`($sp)
248238384Sjkim	$POP	r18,`$FRAME-$SIZE_T*14`($sp)
249238384Sjkim	$POP	r19,`$FRAME-$SIZE_T*13`($sp)
250238384Sjkim	$POP	r20,`$FRAME-$SIZE_T*12`($sp)
251238384Sjkim	$POP	r21,`$FRAME-$SIZE_T*11`($sp)
252238384Sjkim	$POP	r22,`$FRAME-$SIZE_T*10`($sp)
253238384Sjkim	$POP	r23,`$FRAME-$SIZE_T*9`($sp)
254238384Sjkim	$POP	r24,`$FRAME-$SIZE_T*8`($sp)
255238384Sjkim	$POP	r25,`$FRAME-$SIZE_T*7`($sp)
256238384Sjkim	$POP	r26,`$FRAME-$SIZE_T*6`($sp)
257238384Sjkim	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
258238384Sjkim	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
259238384Sjkim	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
260238384Sjkim	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
261238384Sjkim	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
262238384Sjkim	mtlr	r0
263238384Sjkim	addi	$sp,$sp,$FRAME
264238384Sjkim	blr
265238384Sjkim	.long	0
266238384Sjkim	.byte	0,12,4,1,0x80,18,3,0
267238384Sjkim	.long	0
268238384Sjkim___
269238384Sjkim
270238384Sjkim# This is private block function, which uses tailored calling
271238384Sjkim# interface, namely upon entry SHA_CTX is pre-loaded to given
272238384Sjkim# registers and counter register contains amount of chunks to
273238384Sjkim# digest...
274238384Sjkim$code.=<<___;
275238384Sjkim.align	4
276238384SjkimLsha1_block_private:
277238384Sjkim___
278238384Sjkim$code.=<<___;	# load K_00_19
279238384Sjkim	lis	$K,0x5a82
280238384Sjkim	ori	$K,$K,0x7999
281238384Sjkim___
282238384Sjkimfor($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
283238384Sjkim$code.=<<___;	# load K_20_39
284238384Sjkim	lis	$K,0x6ed9
285238384Sjkim	ori	$K,$K,0xeba1
286238384Sjkim___
287238384Sjkimfor(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
288238384Sjkim$code.=<<___;	# load K_40_59
289238384Sjkim	lis	$K,0x8f1b
290238384Sjkim	ori	$K,$K,0xbcdc
291238384Sjkim___
292238384Sjkimfor(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
293238384Sjkim$code.=<<___;	# load K_60_79
294238384Sjkim	lis	$K,0xca62
295238384Sjkim	ori	$K,$K,0xc1d6
296238384Sjkim___
297238384Sjkimfor(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
298238384Sjkim$code.=<<___;
299238384Sjkim	add	r16,r16,$E
300238384Sjkim	add	r17,r17,$T
301238384Sjkim	add	r18,r18,$A
302238384Sjkim	add	r19,r19,$B
303238384Sjkim	add	r20,r20,$C
304238384Sjkim	stw	r16,0($ctx)
305238384Sjkim	mr	$A,r16
306238384Sjkim	stw	r17,4($ctx)
307238384Sjkim	mr	$B,r17
308238384Sjkim	stw	r18,8($ctx)
309238384Sjkim	mr	$C,r18
310238384Sjkim	stw	r19,12($ctx)
311238384Sjkim	mr	$D,r19
312238384Sjkim	stw	r20,16($ctx)
313238384Sjkim	mr	$E,r20
314238384Sjkim	addi	$inp,$inp,`16*4`
315238384Sjkim	bdnz-	Lsha1_block_private
316238384Sjkim	blr
317238384Sjkim	.long	0
318238384Sjkim	.byte	0,12,0x14,0,0,0,0,0
319238384Sjkim___
320238384Sjkim$code.=<<___;
321238384Sjkim.asciz	"SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
322238384Sjkim___
323238384Sjkim
324238384Sjkim$code =~ s/\`([^\`]*)\`/eval $1/gem;
325238384Sjkimprint $code;
326238384Sjkimclose STDOUT;
327