#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses a 256-byte per-key table [+128 bytes shared table]. Streamed
# GHASH performance was measured to be 6.67 cycles per processed byte
# on Itanium 2, which is >90% better than Microsoft compiler generated
# code. To anchor to something else, the sha1-ia64.pl module processes
# one byte in 5.7 cycles. On Itanium GHASH should run at ~8.5 cycles
# per byte.

# September 2010
#
# It was originally thought that it makes less sense to implement
# "528B" variant on Itanium 2 for the following reason. Because the
# number of functional units is naturally limited, it appeared
# impossible to implement "528B" loop in 4 cycles, only in 5. This
# would mean that theoretically performance improvement couldn't be
# more than 20%. But occasionally you prove yourself wrong:-) I figured
# out a way to fold a couple of instructions and having freed yet
# another instruction slot by unrolling the loop... Resulting
# performance is 4.45 cycles per processed byte and 50% better than
# "256B" version. On original Itanium performance should remain the
# same as the "256B" version, i.e. ~8.5 cycles.

# Optional first argument names the output file; when it is given, STDOUT
# is redirected there so that the final "print $code" lands in that file.
# The three-argument open keeps the file name from being parsed as a mode.
$output=shift and (open STDOUT,'>',$output or die "can't open $output: $!");

# $ADDP is the mnemonic used below to form addresses from the incoming
# arguments. It is "addp4" on HP-UX unless a 64-bit data model flag
# (+DD64 or -mlp64) appears among the remaining arguments, and plain
# "add" everywhere else.
if ($^O eq "hpux") {
    $ADDP="addp4";
    for (@ARGV) { $ADDP="add" if (/(?:\+DD|\-mlp)64/); }
} else { $ADDP="add"; }

# Endianness of the target: explicit -DB_ENDIAN/-DL_ENDIAN flags win,
# otherwise fall back to the byte order of the machine running the script.
for (@ARGV)  {  $big_endian=1 if (/\-DB_ENDIAN/);
                $big_endian=0 if (/\-DL_ENDIAN/);  }
if (!defined($big_endian))
             {  $big_endian=(unpack('L',pack('N',1))==1);  }

# loop($label, $mask_inp)
#
# Appends the body of the "4-bit" multiplication loop to the global $code.
#   $label    - assembler label placed in front of the loop and used as the
#               target of the closing br.ctop.
#   $mask_inp - when true, the two instructions that touch inp/in[] are
#               predicated on p63 instead of p16/p17 (see "mask references
#               to inp" below); when false they use p16/p17.
# Returns nothing useful; the only effect is the text appended to $code.
sub loop {
my ($label,$mask_inp)=@_;
my ($p16,$p17)=$mask_inp?("p63","p63"):("p16","p17"); # mask references to inp

# Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e.
# in scalable manner;-) Naturally assuming data in L1 cache...
# Special note about 'dep' instruction, which is used to construct
# &rem_4bit[Zlo&0xf]. It works, because rem_4bit is aligned at 128
# bytes boundary and lower 7 bits of its address are guaranteed to
# be zero.
$code.=<<___;
$label:
{ .mfi;	(p18)	ld8	Hlo=[Hi[1]],-8
	(p19)	dep	rem=Zlo,rem_4bitp,3,4	}
{ .mfi;	(p19)	xor	Zhi=Zhi,Hhi
	($p17)	xor	xi[1]=xi[1],in[1]	};;
{ .mfi;	(p18)	ld8	Hhi=[Hi[1]]
	(p19)	shrp	Zlo=Zhi,Zlo,4		}
{ .mfi;	(p19)	ld8	rem=[rem]
	(p18)	and	Hi[1]=mask0xf0,xi[2]	};;
{ .mmi;	($p16)	ld1	in[0]=[inp],-1
	(p18)	xor	Zlo=Zlo,Hlo
	(p19)	shr.u	Zhi=Zhi,4		}
{ .mib;	(p19)	xor	Hhi=Hhi,rem
	(p18)	add	Hi[1]=Htbl,Hi[1]	};;

{ .mfi;	(p18)	ld8	Hlo=[Hi[1]],-8
	(p18)	dep	rem=Zlo,rem_4bitp,3,4	}
{ .mfi;	(p17)	shladd	Hi[0]=xi[1],4,r0
	(p18)	xor	Zhi=Zhi,Hhi		};;
{ .mfi;	(p18)	ld8	Hhi=[Hi[1]]
	(p18)	shrp	Zlo=Zhi,Zlo,4		}
{ .mfi;	(p18)	ld8	rem=[rem]
	(p17)	and	Hi[0]=mask0xf0,Hi[0]	};;
{ .mmi;	(p16)	ld1	xi[0]=[Xi],-1
	(p18)	xor	Zlo=Zlo,Hlo
	(p18)	shr.u	Zhi=Zhi,4		}
{ .mib;	(p18)	xor	Hhi=Hhi,rem
	(p17)	add	Hi[0]=Htbl,Hi[0]
	br.ctop.sptk	$label			};;
___
}

# Start of the generated file: register aliases shared by the "4-bit"
# code, followed by gcm_gmult_4bit. The procedure body is the prologue
# below, the loop emitted by loop(".Loop1",1) (inp references masked),
# and the epilogue after .Lend1 that stores Zlo/Zhi relative to Xi.
# This is the first assignment to $code; everything later appends to it.
$code=<<___;
.explicit
.text

prevfs=r2;	prevlc=r3;	prevpr=r8;
mask0xf0=r21;
rem=r22;	rem_4bitp=r23;
Xi=r24;		Htbl=r25;
inp=r26;	end=r27;
Hhi=r28;	Hlo=r29;
Zhi=r30;	Zlo=r31;

.align	128
.skip	16					// aligns loop body
.global	gcm_gmult_4bit#
.proc	gcm_gmult_4bit#
gcm_gmult_4bit:
	.prologue
{ .mmi;	.save	ar.pfs,prevfs
	alloc	prevfs=ar.pfs,2,6,0,8
	$ADDP	Xi=15,in0			// &Xi[15]
	mov	rem_4bitp=ip		}
{ .mii;	$ADDP	Htbl=8,in1			// &Htbl[0].lo
	.save	ar.lc,prevlc
	mov	prevlc=ar.lc
	.save	pr,prevpr
	mov	prevpr=pr		};;

	.body
	.rotr	in[3],xi[3],Hi[2]

{ .mib;	ld1	xi[2]=[Xi],-1			// Xi[15]
	mov	mask0xf0=0xf0
	brp.loop.imp	.Loop1,.Lend1-16};;
{ .mmi;	ld1	xi[1]=[Xi],-1			// Xi[14]
					};;
{ .mii;	shladd	Hi[1]=xi[2],4,r0
	mov	pr.rot=0x7<<16
	mov	ar.lc=13		};;
{ .mii;	and	Hi[1]=mask0xf0,Hi[1]
	mov	ar.ec=3
	xor	Zlo=Zlo,Zlo		};;
{ .mii;	add	Hi[1]=Htbl,Hi[1]		// &Htbl[nlo].lo
	add	rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
	xor	Zhi=Zhi,Zhi		};;
___
	# second argument is true: the loop's inp references are masked
	&loop	(".Loop1",1);
$code.=<<___;
.Lend1:
{ .mib;	xor	Zhi=Zhi,Hhi		};;	// modulo-scheduling artefact
{ .mib;	mux1	Zlo=Zlo,\@rev		};;
{ .mib;	mux1	Zhi=Zhi,\@rev		};;
{ .mmi;	add	Hlo=9,Xi;;			// ;; is here to prevent
	add	Hhi=1,Xi		};;	// pipeline flush on Itanium
{ .mib;	st8	[Hlo]=Zlo
	mov	pr=prevpr,0x1ffff	};;
{ .mib;	st8	[Hhi]=Zhi
	mov	ar.lc=prevlc
	br.ret.sptk.many	b0	};;
.endp	gcm_gmult_4bit#
___

######################################################################
# "528B" (well, "512B" actually) streamed GHASH
#
# Register names substituted into the gcm_ghash_4bit templates below.
$Xip="in0";
$Htbl="in1";
$inp="in2";
$len="in3";
$rem_8bit="loc0";
$mask0xff="loc1";
# $sum/$rum are emitted where the generated code switches to big-endian
# and back to little-endian; for a big-endian target both become nop.m.
($sum,$rum) = $big_endian ? ("nop.m","nop.m") : ("sum","rum");

# load_htable(@insns)
#
# Appends eight pairs of bundles to the global $code. Pair $i loads
# Htable[$i] into general registers through r8/r9 and Htable[8+$i] into
# floating-point registers through r10/r11. The `...` expressions are left
# in the text and evaluated when the whole of $code is post-processed.
#
# Each element of @insns is one instruction (without trailing newline)
# that is placed right before the closing "};;" of a pair. The strings are
# consumed in order and aligned so that the last one lands in the final
# pair: with eight strings every pair gets one, with fewer the earlier
# pairs get none.
sub load_htable {
    my @insns=@_;
    for (my $i=0;$i<8;$i++) {
	$code.=<<___;
{ .mmi;	ld8	r`16+2*$i+1`=[r8],16		// Htable[$i].hi
	ld8	r`16+2*$i`=[r9],16	}	// Htable[$i].lo
{ .mmi;	ldf8	f`32+2*$i+1`=[r10],16		// Htable[`8+$i`].hi
	ldf8	f`32+2*$i`=[r11],16		// Htable[`8+$i`].lo
___
	$code.=shift(@insns)	if (($i+$#insns)==7);
	$code.="\t};;\n";
    }
}

# Prologue of gcm_ghash_4bit: saves sp in prevsp, points r8..r11 at the
# caller's Htable, then loads it via load_htable(). The eight extra
# instructions handed to load_htable() compute &Xi[15], &inp[len] and
# &inp[15], set up the 0xff mask, carve a 512-byte frame off sp, align it
# with that mask, and point r14/r15 at it. The last template re-aims
# r8..r11 at offset 256 of the frame and adjusts $len by -17.
$code.=<<___;
prevsp=r3;

.align	32
.skip	16					// aligns loop body
.global	gcm_ghash_4bit#
.proc	gcm_ghash_4bit#
gcm_ghash_4bit:
	.prologue
{ .mmi;	.save	ar.pfs,prevfs
	alloc	prevfs=ar.pfs,4,2,0,0
	.vframe	prevsp
	mov	prevsp=sp
	mov	$rem_8bit=ip		};;
	.body
{ .mfi;	$ADDP	r8=0+0,$Htbl
	$ADDP	r9=0+8,$Htbl		}
{ .mfi;	$ADDP	r10=128+0,$Htbl
	$ADDP	r11=128+8,$Htbl		};;
___
	&load_htable(
	"	$ADDP	$Xip=15,$Xip",		# &Xi[15]
	"	$ADDP	$len=$len,$inp",	# &inp[len]
	"	$ADDP	$inp=15,$inp",		# &inp[15]
	"	mov	$mask0xff=0xff",
	"	add	sp=-512,sp",
	"	andcm	sp=sp,$mask0xff",	# align stack frame
	"	add	r14=0,sp",
	"	add	r15=8,sp");
$code.=<<___;
{ .mmi;	$sum	1<<1				// go big-endian
	add	r8=256+0,sp
	add	r9=256+8,sp		}
{ .mmi;	add	r10=256+128+0,sp
	add	r11=256+128+8,sp
	add	$len=-17,$len		};;
___
# For Htable[0..7]: store the entry loaded into r16..r31 through r8/r9,
# store Htable[8..15] from the floating-point registers through r10/r11,
# and store the 128-bit value shifted right by 4 through r14/r15.
for($i=0;$i<8;$i++) {	# generate first half of Hshr4[]
# names of the general registers holding Htable[$i].lo / Htable[$i].hi
my ($rlo,$rhi)=("r".(16+2*$i),"r".(16+2*$i+1));
$code.=<<___;
{ .mmi;	st8	[r8]=$rlo,16			// Htable[$i].lo
	st8	[r9]=$rhi,16			// Htable[$i].hi
	shrp	$rlo=$rhi,$rlo,4	}//;;
{ .mmi;	stf8	[r10]=f`32+2*$i`,16		// Htable[`8+$i`].lo
	stf8	[r11]=f`32+2*$i+1`,16		// Htable[`8+$i`].hi
	shr.u	$rhi=$rhi,4		};;
{ .mmi;	st8	[r14]=$rlo,16			// Htable[$i].lo>>4
	st8	[r15]=$rhi,16		}//;;	// Htable[$i].hi>>4
___
}
# Second half of Hshr4[]: the remaining entries are read back through
# r8/r9 into general registers, shifted right by 4 and stored through
# r14/r15. Loads run two entries ahead of the stores.
$code.=<<___;
{ .mmi;	ld8	r16=[r8],16			// Htable[8].lo
	ld8	r17=[r9],16		};;	// Htable[8].hi
{ .mmi;	ld8	r18=[r8],16			// Htable[9].lo
	ld8	r19=[r9],16		}	// Htable[9].hi
{ .mmi;	rum	1<<5				// clear um.mfh
	shrp	r16=r17,r16,4		};;
___
for($i=0;$i<6;$i++) {	# generate second half of Hshr4[]
$code.=<<___;
{ .mmi;	ld8	r`20+2*$i`=[r8],16		// Htable[`10+$i`].lo
	ld8	r`20+2*$i+1`=[r9],16		// Htable[`10+$i`].hi
	shr.u	r`16+2*$i+1`=r`16+2*$i+1`,4	};;
{ .mmi;	st8	[r14]=r`16+2*$i`,16		// Htable[`8+$i`].lo>>4
	st8	[r15]=r`16+2*$i+1`,16		// Htable[`8+$i`].hi>>4
	shrp	r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4	}
___
}
# The tail below deliberately reuses $i as left by the loop above (6) to
# drain the last two entries; it also repoints $Htbl at offset 256 of the
# stack frame and turns $rem_8bit from ip into the address of rem_8bit.
$code.=<<___;
{ .mmi;	shr.u	r`16+2*$i+1`=r`16+2*$i+1`,4	};;
{ .mmi;	st8	[r14]=r`16+2*$i`,16		// Htable[`8+$i`].lo>>4
	st8	[r15]=r`16+2*$i+1`,16		// Htable[`8+$i`].hi>>4
	shrp	r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4	}
{ .mmi;	add	$Htbl=256,sp			// &Htable[0]
	add	$rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit
	shr.u	r`18+2*$i+1`=r`18+2*$i+1`,4	};;
{ .mmi;	st8	[r14]=r`18+2*$i`		// Htable[`8+$i`].lo>>4
	st8	[r15]=r`18+2*$i+1`	}	// Htable[`8+$i`].hi>>4
___

# Register assignment for the streamed loop. @xi and @rem are two-element
# lists that are "rotated" (first element moved to the end) after every
# emitted stage, so $xi[0]/$xi[1] and $rem[0]/$rem[1] name different
# registers in consecutive stages.
$in="r15";
@xi=("r16","r17");
@rem=("r18","r19");
($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=("r20","r21","r22","r23","r24","r25");
($Atbl,$Btbl)=("r26","r27");

# The (pNN) tags in the comments name the pipeline stage an instruction
# belongs to. First stage: load the last bytes of inp and Xi, clear p6.
$code.=<<___;	# (p16)
{ .mmi;	ld1	$in=[$inp],-1			//(p16) *inp--
	ld1	$xi[0]=[$Xip],-1		//(p16) *Xi--
	cmp.eq	p0,p6=r0,r0		};;	//	clear p6
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

# Second stage and the .LOOP entry point; the (p6) store of Z.hi only
# executes when p6 is set.
$code.=<<___;	# (p16),(p17)
{ .mmi;	ld1	$xi[0]=[$Xip],-1		//(p16) *Xi--
	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
{ .mii;	ld1	$in=[$inp],-1			//(p16) *inp--
	dep	$Atbl=$xi[1],$Htbl,4,4		//(p17) &Htable[nlo].lo
	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
.align	32
.LOOP:
{ .mmi;
(p6)	st8	[$Xip]=$Zhi,13
	xor	$Zlo=$Zlo,$Zlo
	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi].lo
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

# Pipeline fill: first stage that has (p18) work but no (p19) work yet, so
# Z.hi is simply initialised from Htable[nlo].hi with a mov.
$code.=<<___;	# (p16),(p17),(p18)
{ .mmi;	ld8	$Alo=[$Atbl],8			//(p18) Htable[nlo].lo,&Htable[nlo].hi
	ld8	$rem[0]=[$Btbl],-256		//(p18) Htable[nhi].lo,&Hshr4[nhi].lo
	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
{ .mfi;	ld8	$Ahi=[$Atbl]			//(p18) Htable[nlo].hi
	dep	$Atbl=$xi[1],$Htbl,4,4	}	//(p17) &Htable[nlo].lo
{ .mfi;	shladd	$rem[0]=$rem[0],4,r0		//(p18) Htable[nhi].lo<<4
	xor	$Zlo=$Zlo,$Alo		};;	//(p18) Z.lo^=Htable[nlo].lo
{ .mmi;	ld8	$Blo=[$Btbl],8			//(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
	ld1	$in=[$inp],-1		}	//(p16) *inp--
{ .mmi;	xor	$rem[0]=$rem[0],$Zlo		//(p18) Z.lo^(Htable[nhi].lo<<4)
	mov	$Zhi=$Ahi			//(p18) Z.hi^=Htable[nlo].hi
	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
{ .mmi;	ld8	$Bhi=[$Btbl]			//(p18) Hshr4[nhi].hi
	ld1	$xi[0]=[$Xip],-1		//(p16) *Xi--
	shrp	$Zlo=$Zhi,$Zlo,8	}	//(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

# Steady state: the full four-stage fragment is emitted 13 times
# ($i = 1..13), rotating the @xi/@rem register names after each copy.
for ($i=1;$i<14;$i++) {
# Above and below fragments are derived from this one by removing
# unsuitable (p??) instructions.
$code.=<<___;	# (p16),(p17),(p18),(p19)
{ .mmi;	ld8	$Alo=[$Atbl],8			//(p18) Htable[nlo].lo,&Htable[nlo].hi
	ld8	$rem[0]=[$Btbl],-256		//(p18) Htable[nhi].lo,&Hshr4[nhi].lo
	shr.u	$Zhi=$Zhi,8		}	//(p19) Z.hi>>=8
{ .mmi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
	xor	$Zlo=$Zlo,$Blo			//(p19) Z.lo^=Hshr4[nhi].lo
	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
{ .mmi;	ld8	$Ahi=[$Atbl]			//(p18) Htable[nlo].hi
	ld2	$rem[1]=[$rem[1]]		//(p19) rem_8bit[rem]
	dep	$Atbl=$xi[1],$Htbl,4,4	}	//(p17) &Htable[nlo].lo
{ .mmi;	shladd	$rem[0]=$rem[0],4,r0		//(p18) Htable[nhi].lo<<4
	xor	$Zlo=$Zlo,$Alo			//(p18) Z.lo^=Htable[nlo].lo
	xor	$Zhi=$Zhi,$Bhi		};;	//(p19) Z.hi^=Hshr4[nhi].hi
{ .mmi;	ld8	$Blo=[$Btbl],8			//(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
	ld1	$in=[$inp],-1			//(p16) *inp--
	shl	$rem[1]=$rem[1],48	}	//(p19) rem_8bit[rem]<<48
{ .mmi;	xor	$rem[0]=$rem[0],$Zlo		//(p18) Z.lo^(Htable[nhi].lo<<4)
	xor	$Zhi=$Zhi,$Ahi			//(p18) Z.hi^=Htable[nlo].hi
	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
{ .mmi;	ld8	$Bhi=[$Btbl]			//(p18) Hshr4[nhi].hi
	ld1	$xi[0]=[$Xip],-1		//(p16) *Xi--
	shrp	$Zlo=$Zhi,$Zlo,8	}	//(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
	xor	$Zhi=$Zhi,$rem[1]		//(p19) Z.hi^=rem_8bit[rem]<<48
	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers
}

# Pipeline drain, step 1: same fragment without the (p16) loads of the
# next inp/Xi bytes.
$code.=<<___;	# (p17),(p18),(p19)
{ .mmi;	ld8	$Alo=[$Atbl],8			//(p18) Htable[nlo].lo,&Htable[nlo].hi
	ld8	$rem[0]=[$Btbl],-256		//(p18) Htable[nhi].lo,&Hshr4[nhi].lo
	shr.u	$Zhi=$Zhi,8		}	//(p19) Z.hi>>=8
{ .mmi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
	xor	$Zlo=$Zlo,$Blo			//(p19) Z.lo^=Hshr4[nhi].lo
	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
{ .mmi;	ld8	$Ahi=[$Atbl]			//(p18) Htable[nlo].hi
	ld2	$rem[1]=[$rem[1]]		//(p19) rem_8bit[rem]
	dep	$Atbl=$xi[1],$Htbl,4,4	};;	//(p17) &Htable[nlo].lo
{ .mmi;	shladd	$rem[0]=$rem[0],4,r0		//(p18) Htable[nhi].lo<<4
	xor	$Zlo=$Zlo,$Alo			//(p18) Z.lo^=Htable[nlo].lo
	xor	$Zhi=$Zhi,$Bhi		};;	//(p19) Z.hi^=Hshr4[nhi].hi
{ .mmi;	ld8	$Blo=[$Btbl],8			//(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
	shl	$rem[1]=$rem[1],48	}	//(p19) rem_8bit[rem]<<48
{ .mmi;	xor	$rem[0]=$rem[0],$Zlo		//(p18) Z.lo^(Htable[nhi].lo<<4)
	xor	$Zhi=$Zhi,$Ahi			//(p18) Z.hi^=Htable[nlo].hi
	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
{ .mmi;	ld8	$Bhi=[$Btbl]			//(p18) Hshr4[nhi].hi
	shrp	$Zlo=$Zhi,$Zlo,8	}	//(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
	xor	$Zhi=$Zhi,$rem[1]		//(p19) Z.hi^=rem_8bit[rem]<<48
	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

# Pipeline drain, step 2: only (p18)/(p19) work remains. Unlike the
# earlier fragments this one shifts Z by 4 bits (shrp ...,4) rather than 8
# and derives rem from Z.lo<<4.
$code.=<<___;	# (p18),(p19)
{ .mfi;	ld8	$Alo=[$Atbl],8			//(p18) Htable[nlo].lo,&Htable[nlo].hi
	shr.u	$Zhi=$Zhi,8		}	//(p19) Z.hi>>=8
{ .mfi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
	xor	$Zlo=$Zlo,$Blo		};;	//(p19) Z.lo^=Hshr4[nhi].lo
{ .mfi;	ld8	$Ahi=[$Atbl]			//(p18) Htable[nlo].hi
	xor	$Zlo=$Zlo,$Alo		}	//(p18) Z.lo^=Htable[nlo].lo
{ .mfi;	ld2	$rem[1]=[$rem[1]]		//(p19) rem_8bit[rem]
	xor	$Zhi=$Zhi,$Bhi		};;	//(p19) Z.hi^=Hshr4[nhi].hi
{ .mfi;	ld8	$Blo=[$Btbl],8			//(p18) Htable[nhi].lo,&Htable[nhi].hi
	shl	$rem[1]=$rem[1],48	}	//(p19) rem_8bit[rem]<<48
{ .mfi;	shladd	$rem[0]=$Zlo,4,r0		//(p18) Z.lo<<4
	xor	$Zhi=$Zhi,$Ahi		};;	//(p18) Z.hi^=Htable[nlo].hi
{ .mfi;	ld8	$Bhi=[$Btbl]			//(p18) Htable[nhi].hi
	shrp	$Zlo=$Zhi,$Zlo,4	}	//(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4)
{ .mfi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
	xor	$Zhi=$Zhi,$rem[1]	};;	//(p19) Z.hi^=rem_8bit[rem]<<48
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

# Last stage and loop control. p6 is set while $inp is below $len
# (unsigned compare); the (p6) instructions start the first stages of the
# next block from the freshly computed Z.lo and branch back to .LOOP.
# On exit Z.hi is stored, sp is restored from prevsp and the function
# returns.
$code.=<<___;	# (p19)
{ .mmi;	cmp.ltu	p6,p0=$inp,$len
	add	$inp=32,$inp
	shr.u	$Zhi=$Zhi,4		}	//(p19) Z.hi>>=4
{ .mmi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
	xor	$Zlo=$Zlo,$Blo			//(p19) Z.lo^=Hshr4[nhi].lo
	add	$Xip=9,$Xip		};;	//	&Xi.lo
{ .mmi;	ld2	$rem[1]=[$rem[1]]		//(p19) rem_8bit[rem]
(p6)	ld1	$in=[$inp],-1			//[p16] *inp--
(p6)	extr.u	$xi[1]=$Zlo,8,8		}	//[p17] Xi[14]
{ .mmi;	xor	$Zhi=$Zhi,$Bhi			//(p19) Z.hi^=Hshr4[nhi].hi
(p6)	and	$xi[0]=$Zlo,$mask0xff	};;	//[p16] Xi[15]
{ .mmi;	st8	[$Xip]=$Zlo,-8
(p6)	xor	$xi[0]=$xi[0],$in		//[p17] xi=$xi[i]^inp[i]
	shl	$rem[1]=$rem[1],48	};;	//(p19) rem_8bit[rem]<<48
{ .mmi;
(p6)	ld1	$in=[$inp],-1			//[p16] *inp--
	xor	$Zhi=$Zhi,$rem[1]		//(p19) Z.hi^=rem_8bit[rem]<<48
(p6)	dep	$Atbl=$xi[0],$Htbl,4,4	}	//[p17] &Htable[nlo].lo
{ .mib;
(p6)	and	$xi[0]=-16,$xi[0]		//[p17] nhi=xi&0xf0
(p6)	br.cond.dptk.many	.LOOP	};;

{ .mib;	st8	[$Xip]=$Zhi		};;
{ .mib;	$rum	1<<1				// return to little-endian
	.restore	sp
	mov	sp=prevsp
	br.ret.sptk.many	b0	};;
.endp	gcm_ghash_4bit#
___
# Constant tables shared by both procedures.
#   rem_4bit - 16 data8 entries (128 bytes), aligned at 128 bytes so that
#              the 'dep' in the 4-bit loop can form &rem_4bit[Zlo&0xf].
#   rem_8bit - 256 two-byte entries (512 bytes), fetched with ld2 after
#              the index has been scaled by 2 (shladd ...,1,rem_8bit).
$code.=<<___;
.align	128
.type	rem_4bit#,\@object
rem_4bit:
        data8	0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
        data8	0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
        data8	0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
        data8	0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
.size	rem_4bit#,128
.type	rem_8bit#,\@object
rem_8bit:
	data1	0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E
	data1	0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E
	data1	0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E
	data1	0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E
	data1	0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E
	data1	0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E
	data1	0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E
	data1	0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E
	data1	0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE
	data1	0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE
	data1	0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE
	data1	0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE
	data1	0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E
	data1	0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E
	data1	0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE
	data1	0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE
	data1	0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E
	data1	0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E
	data1	0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E
	data1	0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E
	data1	0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E
	data1	0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E
	data1	0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E
	data1	0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E
	data1	0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE
	data1	0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE
	data1	0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE
	data1	0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE
	data1	0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E
	data1	0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E
	data1	0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE
	data1	0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE
.size	rem_8bit#,512
stringz	"GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
___

# For a big-endian target every "mux1 <operands>@rev" instruction in the
# generated text is replaced by "nop.i 0x0".
$code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm      if ($big_endian);
# Replace each `...` in the templates by the value of the Perl expression
# between the backticks (register numbers and table indices).
$code =~ s/\`([^\`]*)\`/eval $1/gem;

print $code;
# Buffered write errors (e.g. disk full) only surface at close, so check it
# instead of silently leaving a truncated assembly file behind.
close STDOUT or die "error closing STDOUT: $!";