#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Streamed
# GHASH performance was measured to be 6.67 cycles per processed byte
# on Itanium 2, which is >90% better than Microsoft compiler generated
# code. To anchor to something else sha1-ia64.pl module processes one
# byte in 5.7 cycles. On Itanium GHASH should run at ~8.5 cycles per
# byte.

# September 2010
#
# It was originally thought that it makes lesser sense to implement
# "528B" variant on Itanium 2 for following reason. Because number of
# functional units is naturally limited, it appeared impossible to
# implement "528B" loop in 4 cycles, only in 5. This would mean that
# theoretically performance improvement couldn't be more than 20%.
# But occasionally you prove yourself wrong:-) I figured out a way to
# fold couple of instructions and having freed yet another instruction
# slot by unrolling the loop... Resulting performance is 4.45 cycles
# per processed byte and 50% better than "256B" version. On original
# Itanium performance should remain the same as the "256B" version,
# i.e. ~8.5 cycles.

# Usage: ghash-ia64.pl [output-file] [compiler flags...]
#
# The first argument, when present and true, names the file the
# generated assembly is written to; otherwise it goes to STDOUT. The
# remaining arguments are scanned for -DB_ENDIAN / -DL_ENDIAN and, on
# HP-UX, for the flags that switch pointer arithmetic from addp4 to add.
#
# NOTE: 'use strict' is deliberately not enabled. Several assembly
# comments below interpolate `$xi[i]`, which relies on the bareword
# index `i` being accepted (it evaluates to element 0).

my $output = shift;
if ($output) {
    # Three-argument open: the file name can never be taken for a mode.
    open STDOUT, '>', $output or die "can't open $output: $!";
}

# Instruction used to form addresses from the pointer arguments.
my $ADDP = "add";
if ($^O eq "hpux") {
    $ADDP = "addp4";
    # "+DD64" or "-mlp64" (presumably the 64-bit data model flags of the
    # HP-UX compilers) select plain add. This is an alternation; it used
    # to be written as a character class, which matched far more.
    for (@ARGV) { $ADDP = "add" if (/(?:\+DD|-mlp)64/); }
}

# Explicit -DB_ENDIAN / -DL_ENDIAN wins; otherwise use the byte order of
# the machine running this script.
my $big_endian;
for (@ARGV) {
    $big_endian = 1 if (/\-DB_ENDIAN/);
    $big_endian = 0 if (/\-DL_ENDIAN/);
}
$big_endian = (unpack('L',pack('N',1))==1) if (!defined($big_endian));

# Accumulates the generated assembly; printed at the very end.
my $code;

# loop(LABEL, MASK_INP)
#
# Appends to $code the modulo-scheduled inner loop of the "256B"
# multiplication, labelled LABEL and closed by br.ctop back to LABEL.
# When MASK_INP is true the two instructions that reference inp/in[]
# are predicated with p63 instead of p16/p17 (mask references to inp).
sub loop {
    my ($label, $mask_inp) = @_;
    my ($p16,$p17) = $mask_inp ? ("p63","p63") : ("p16","p17");

# Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e.
# in scalable manner;-) Naturally assuming data in L1 cache...
# Special note about 'dep' instruction, which is used to construct
# &rem_4bit[Zlo&0xf]. It works, because rem_4bit is aligned at 128
# bytes boundary and lower 7 bits of its address are guaranteed to
# be zero.
    $code.=<<___;
$label:
{ .mfi; (p18)   ld8     Hlo=[Hi[1]],-8
        (p19)   dep     rem=Zlo,rem_4bitp,3,4   }
{ .mfi; (p19)   xor     Zhi=Zhi,Hhi
        ($p17)  xor     xi[1]=xi[1],in[1]       };;
{ .mfi; (p18)   ld8     Hhi=[Hi[1]]
        (p19)   shrp    Zlo=Zhi,Zlo,4           }
{ .mfi; (p19)   ld8     rem=[rem]
        (p18)   and     Hi[1]=mask0xf0,xi[2]    };;
{ .mmi; ($p16)  ld1     in[0]=[inp],-1
        (p18)   xor     Zlo=Zlo,Hlo
        (p19)   shr.u   Zhi=Zhi,4               }
{ .mib; (p19)   xor     Hhi=Hhi,rem
        (p18)   add     Hi[1]=Htbl,Hi[1]        };;

{ .mfi; (p18)   ld8     Hlo=[Hi[1]],-8
        (p18)   dep     rem=Zlo,rem_4bitp,3,4   }
{ .mfi; (p17)   shladd  Hi[0]=xi[1],4,r0
        (p18)   xor     Zhi=Zhi,Hhi             };;
{ .mfi; (p18)   ld8     Hhi=[Hi[1]]
        (p18)   shrp    Zlo=Zhi,Zlo,4           }
{ .mfi; (p18)   ld8     rem=[rem]
        (p17)   and     Hi[0]=mask0xf0,Hi[0]    };;
{ .mmi; (p16)   ld1     xi[0]=[Xi],-1
        (p18)   xor     Zlo=Zlo,Hlo
        (p18)   shr.u   Zhi=Zhi,4               }
{ .mib; (p18)   xor     Hhi=Hhi,rem
        (p17)   add     Hi[0]=Htbl,Hi[0]
        br.ctop.sptk    $label                  };;
___
}

######################################################################
# "256B" single multiplication, gcm_gmult_4bit
#
$code = <<___;
.explicit
.text

prevfs=r2;      prevlc=r3;      prevpr=r8;
mask0xf0=r21;
rem=r22;        rem_4bitp=r23;
Xi=r24;         Htbl=r25;
inp=r26;        end=r27;
Hhi=r28;        Hlo=r29;
Zhi=r30;        Zlo=r31;

.align  128
.skip   16                                      // aligns loop body
.global gcm_gmult_4bit#
.proc   gcm_gmult_4bit#
gcm_gmult_4bit:
        .prologue
{ .mmi; .save   ar.pfs,prevfs
        alloc   prevfs=ar.pfs,2,6,0,8
        $ADDP   Xi=15,in0                       // &Xi[15]
        mov     rem_4bitp=ip            }
{ .mii; $ADDP   Htbl=8,in1                      // &Htbl[0].lo
        .save   ar.lc,prevlc
        mov     prevlc=ar.lc
        .save   pr,prevpr
        mov     prevpr=pr               };;

        .body
        .rotr   in[3],xi[3],Hi[2]

{ .mib; ld1     xi[2]=[Xi],-1                   // Xi[15]
        mov     mask0xf0=0xf0
        brp.loop.imp    .Loop1,.Lend1-16};;
{ .mmi; ld1     xi[1]=[Xi],-1                   // Xi[14]
                                        };;
{ .mii; shladd  Hi[1]=xi[2],4,r0
        mov     pr.rot=0x7<<16
        mov     ar.lc=13                };;
{ .mii; and     Hi[1]=mask0xf0,Hi[1]
        mov     ar.ec=3
        xor     Zlo=Zlo,Zlo             };;
{ .mii; add     Hi[1]=Htbl,Hi[1]                // &Htbl[nlo].lo
        add     rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
        xor     Zhi=Zhi,Zhi             };;
___
loop(".Loop1", 1);
$code.=<<___;
.Lend1:
{ .mib; xor     Zhi=Zhi,Hhi             };;     // modulo-scheduling artefact
{ .mib; mux1    Zlo=Zlo,\@rev           };;
{ .mib; mux1    Zhi=Zhi,\@rev           };;
{ .mmi; add     Hlo=9,Xi;;                      // ;; is here to prevent
        add     Hhi=1,Xi                };;     // pipeline flush on Itanium
{ .mib; st8     [Hlo]=Zlo
        mov     pr=prevpr,0x1ffff       };;
{ .mib; st8     [Hhi]=Zhi
        mov     ar.lc=prevlc
        br.ret.sptk.many        b0      };;
.endp   gcm_gmult_4bit#
___

######################################################################
# "528B" (well, "512B" actually) streamed GHASH
#
my $Xip      = "in0";
my $Htbl     = "in1";
my $inp      = "in2";
my $len      = "in3";
my $rem_8bit = "loc0";
my $mask0xff = "loc1";
# On big-endian builds the sum/rum byte-order switches become no-ops.
my ($sum,$rum) = $big_endian ? ("nop.m","nop.m") : ("sum","rum");

# load_htable(INSN, ...)
#
# Appends eight bundle pairs that load Htable[0..7] into r16..r31 and
# Htable[8..15] into f32..f47. Each argument is one extra instruction
# (a string) placed in the otherwise free third slot of the second
# bundle. The arguments are right-aligned against the eight iterations:
# one is consumed whenever the number still pending equals the number
# of iterations left, so eight arguments put one in every iteration.
sub load_htable {
    for (my $i=0;$i<8;$i++) {
        $code.=<<___;
{ .mmi; ld8     r`16+2*$i+1`=[r8],16            // Htable[$i].hi
        ld8     r`16+2*$i`=[r9],16      }       // Htable[$i].lo
{ .mmi; ldf8    f`32+2*$i+1`=[r10],16           // Htable[`8+$i`].hi
        ldf8    f`32+2*$i`=[r11],16             // Htable[`8+$i`].lo
___
        $code.=shift if (($i+$#_)==7);
        $code.="\t};;\n";
    }
}

$code.=<<___;
prevsp=r3;

.align  32
.skip   16                                      // aligns loop body
.global gcm_ghash_4bit#
.proc   gcm_ghash_4bit#
gcm_ghash_4bit:
        .prologue
{ .mmi; .save   ar.pfs,prevfs
        alloc   prevfs=ar.pfs,4,2,0,0
        .vframe prevsp
        mov     prevsp=sp
        mov     $rem_8bit=ip            };;
        .body
{ .mfi; $ADDP   r8=0+0,$Htbl
        $ADDP   r9=0+8,$Htbl            }
{ .mfi; $ADDP   r10=128+0,$Htbl
        $ADDP   r11=128+8,$Htbl         };;
___
load_htable(
        "\t$ADDP\t$Xip=15,$Xip",                # &Xi[15]
        "\t$ADDP\t$len=$len,$inp",              # &inp[len]
        "\t$ADDP\t$inp=15,$inp",                # &inp[15]
        "\tmov\t$mask0xff=0xff",
        "\tadd\tsp=-512,sp",
        "\tandcm\tsp=sp,$mask0xff",             # align stack frame
        "\tadd\tr14=0,sp",
        "\tadd\tr15=8,sp");
$code.=<<___;
{ .mmi; $sum    1<<1                            // go big-endian
        add     r8=256+0,sp
        add     r9=256+8,sp             }
{ .mmi; add     r10=256+128+0,sp
        add     r11=256+128+8,sp
        add     $len=-17,$len           };;
___

# Shared by the generator loops below; its value after the second loop
# (6) is used by the fragment that follows it, so it must outlive the loop.
my $i;

for ($i=0;$i<8;$i++) {          # generate first half of Hshr4[]
    my ($rlo,$rhi) = ("r".(16+2*$i), "r".(16+2*$i+1));
    $code.=<<___;
{ .mmi; st8     [r8]=$rlo,16                    // Htable[$i].lo
        st8     [r9]=$rhi,16                    // Htable[$i].hi
        shrp    $rlo=$rhi,$rlo,4        }//;;
{ .mmi; stf8    [r10]=f`32+2*$i`,16             // Htable[`8+$i`].lo
        stf8    [r11]=f`32+2*$i+1`,16           // Htable[`8+$i`].hi
        shr.u   $rhi=$rhi,4             };;
{ .mmi; st8     [r14]=$rlo,16                   // Htable[$i].lo>>4
        st8     [r15]=$rhi,16           }//;;   // Htable[$i].hi>>4
___
}
$code.=<<___;
{ .mmi; ld8     r16=[r8],16                     // Htable[8].lo
        ld8     r17=[r9],16             };;     // Htable[8].hi
{ .mmi; ld8     r18=[r8],16                     // Htable[9].lo
        ld8     r19=[r9],16             }       // Htable[9].hi
{ .mmi; rum     1<<5                            // clear um.mfh
        shrp    r16=r17,r16,4           };;
___
for ($i=0;$i<6;$i++) {          # generate second half of Hshr4[]
    $code.=<<___;
{ .mmi; ld8     r`20+2*$i`=[r8],16              // Htable[`10+$i`].lo
        ld8     r`20+2*$i+1`=[r9],16            // Htable[`10+$i`].hi
        shr.u   r`16+2*$i+1`=r`16+2*$i+1`,4     };;
{ .mmi; st8     [r14]=r`16+2*$i`,16             // Htable[`8+$i`].lo>>4
        st8     [r15]=r`16+2*$i+1`,16           // Htable[`8+$i`].hi>>4
        shrp    r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4    }
___
}
# $i is 6 here: drain the pipeline by shifting and storing the two
# entries (14 and 15) that the loop above loaded but did not finish.
$code.=<<___;
{ .mmi; shr.u   r`16+2*$i+1`=r`16+2*$i+1`,4     };;
{ .mmi; st8     [r14]=r`16+2*$i`,16             // Htable[`8+$i`].lo>>4
        st8     [r15]=r`16+2*$i+1`,16           // Htable[`8+$i`].hi>>4
        shrp    r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4    }
{ .mmi; add     $Htbl=256,sp                    // &Htable[0]
        add     $rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit
        shr.u   r`18+2*$i+1`=r`18+2*$i+1`,4     };;
{ .mmi; st8     [r14]=r`18+2*$i`                // Htable[`9+$i`].lo>>4
        st8     [r15]=r`18+2*$i+1`      }       // Htable[`9+$i`].hi>>4
___

# Register names for the hand-unrolled GHASH loop. @xi and @rem are
# two-element rings that are "rotated" (first element moved to the end)
# after every emitted stage, emulating rotating registers.
my $in  = "r15";
my @xi  = ("r16","r17");
my @rem = ("r18","r19");
my ($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi) = ("r20","r21","r22","r23","r24","r25");
my ($Atbl,$Btbl) = ("r26","r27");

$code.=<<___;   # (p16)
{ .mmi; ld1     $in=[$inp],-1                   //(p16) *inp--
        ld1     $xi[0]=[$Xip],-1                //(p16) *Xi--
        cmp.eq  p0,p6=r0,r0             };;     //      clear p6
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers

$code.=<<___;   # (p16),(p17)
{ .mmi; ld1     $xi[0]=[$Xip],-1                //(p16) *Xi--
        xor     $xi[1]=$xi[1],$in       };;     //(p17) xi=$xi[i]^inp[i]
{ .mii; ld1     $in=[$inp],-1                   //(p16) *inp--
        dep     $Atbl=$xi[1],$Htbl,4,4          //(p17) &Htable[nlo].lo
        and     $xi[1]=-16,$xi[1]       };;     //(p17) nhi=xi&0xf0
.align  32
.LOOP:
{ .mmi;
(p6)    st8     [$Xip]=$Zhi,13
        xor     $Zlo=$Zlo,$Zlo
        add     $Btbl=$xi[1],$Htbl      };;     //(p17) &Htable[nhi].lo
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers

$code.=<<___;   # (p16),(p17),(p18)
{ .mmi; ld8     $Alo=[$Atbl],8                  //(p18) Htable[nlo].lo,&Htable[nlo].hi
        ld8     $rem[0]=[$Btbl],-256            //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
        xor     $xi[1]=$xi[1],$in       };;     //(p17) xi=$xi[i]^inp[i]
{ .mfi; ld8     $Ahi=[$Atbl]                    //(p18) Htable[nlo].hi
        dep     $Atbl=$xi[1],$Htbl,4,4  }       //(p17) &Htable[nlo].lo
{ .mfi; shladd  $rem[0]=$rem[0],4,r0            //(p18) Htable[nhi].lo<<4
        xor     $Zlo=$Zlo,$Alo          };;     //(p18) Z.lo^=Htable[nlo].lo
{ .mmi; ld8     $Blo=[$Btbl],8                  //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
        ld1     $in=[$inp],-1           }       //(p16) *inp--
{ .mmi; xor     $rem[0]=$rem[0],$Zlo            //(p18) Z.lo^(Htable[nhi].lo<<4)
        mov     $Zhi=$Ahi                       //(p18) Z.hi^=Htable[nlo].hi
        and     $xi[1]=-16,$xi[1]       };;     //(p17) nhi=xi&0xf0
{ .mmi; ld8     $Bhi=[$Btbl]                    //(p18) Hshr4[nhi].hi
        ld1     $xi[0]=[$Xip],-1                //(p16) *Xi--
        shrp    $Zlo=$Zhi,$Zlo,8        }       //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi; and     $rem[0]=$rem[0],$mask0xff       //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
        add     $Btbl=$xi[1],$Htbl      };;     //(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers

for ($i=1;$i<14;$i++) {
# Above and below fragments are derived from this one by removing
# unsuitable (p??) instructions.
    $code.=<<___;       # (p16),(p17),(p18),(p19)
{ .mmi; ld8     $Alo=[$Atbl],8                  //(p18) Htable[nlo].lo,&Htable[nlo].hi
        ld8     $rem[0]=[$Btbl],-256            //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
        shr.u   $Zhi=$Zhi,8             }       //(p19) Z.hi>>=8
{ .mmi; shladd  $rem[1]=$rem[1],1,$rem_8bit     //(p19) &rem_8bit[rem]
        xor     $Zlo=$Zlo,$Blo                  //(p19) Z.lo^=Hshr4[nhi].lo
        xor     $xi[1]=$xi[1],$in       };;     //(p17) xi=$xi[i]^inp[i]
{ .mmi; ld8     $Ahi=[$Atbl]                    //(p18) Htable[nlo].hi
        ld2     $rem[1]=[$rem[1]]               //(p19) rem_8bit[rem]
        dep     $Atbl=$xi[1],$Htbl,4,4  }       //(p17) &Htable[nlo].lo
{ .mmi; shladd  $rem[0]=$rem[0],4,r0            //(p18) Htable[nhi].lo<<4
        xor     $Zlo=$Zlo,$Alo                  //(p18) Z.lo^=Htable[nlo].lo
        xor     $Zhi=$Zhi,$Bhi          };;     //(p19) Z.hi^=Hshr4[nhi].hi
{ .mmi; ld8     $Blo=[$Btbl],8                  //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
        ld1     $in=[$inp],-1                   //(p16) *inp--
        shl     $rem[1]=$rem[1],48      }       //(p19) rem_8bit[rem]<<48
{ .mmi; xor     $rem[0]=$rem[0],$Zlo            //(p18) Z.lo^(Htable[nhi].lo<<4)
        xor     $Zhi=$Zhi,$Ahi                  //(p18) Z.hi^=Htable[nlo].hi
        and     $xi[1]=-16,$xi[1]       };;     //(p17) nhi=xi&0xf0
{ .mmi; ld8     $Bhi=[$Btbl]                    //(p18) Hshr4[nhi].hi
        ld1     $xi[0]=[$Xip],-1                //(p16) *Xi--
        shrp    $Zlo=$Zhi,$Zlo,8        }       //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi; and     $rem[0]=$rem[0],$mask0xff       //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
        xor     $Zhi=$Zhi,$rem[1]               //(p19) Z.hi^=rem_8bit[rem]<<48
        add     $Btbl=$xi[1],$Htbl      };;     //(p17) &Htable[nhi]
___
    push (@xi,shift(@xi)); push (@rem,shift(@rem));     # "rotate" registers
}

$code.=<<___;   # (p17),(p18),(p19)
{ .mmi; ld8     $Alo=[$Atbl],8                  //(p18) Htable[nlo].lo,&Htable[nlo].hi
        ld8     $rem[0]=[$Btbl],-256            //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
        shr.u   $Zhi=$Zhi,8             }       //(p19) Z.hi>>=8
{ .mmi; shladd  $rem[1]=$rem[1],1,$rem_8bit     //(p19) &rem_8bit[rem]
        xor     $Zlo=$Zlo,$Blo                  //(p19) Z.lo^=Hshr4[nhi].lo
        xor     $xi[1]=$xi[1],$in       };;     //(p17) xi=$xi[i]^inp[i]
{ .mmi; ld8     $Ahi=[$Atbl]                    //(p18) Htable[nlo].hi
        ld2     $rem[1]=[$rem[1]]               //(p19) rem_8bit[rem]
        dep     $Atbl=$xi[1],$Htbl,4,4  };;     //(p17) &Htable[nlo].lo
{ .mmi; shladd  $rem[0]=$rem[0],4,r0            //(p18) Htable[nhi].lo<<4
        xor     $Zlo=$Zlo,$Alo                  //(p18) Z.lo^=Htable[nlo].lo
        xor     $Zhi=$Zhi,$Bhi          };;     //(p19) Z.hi^=Hshr4[nhi].hi
{ .mmi; ld8     $Blo=[$Btbl],8                  //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
        shl     $rem[1]=$rem[1],48      }       //(p19) rem_8bit[rem]<<48
{ .mmi; xor     $rem[0]=$rem[0],$Zlo            //(p18) Z.lo^(Htable[nhi].lo<<4)
        xor     $Zhi=$Zhi,$Ahi                  //(p18) Z.hi^=Htable[nlo].hi
        and     $xi[1]=-16,$xi[1]       };;     //(p17) nhi=xi&0xf0
{ .mmi; ld8     $Bhi=[$Btbl]                    //(p18) Hshr4[nhi].hi
        shrp    $Zlo=$Zhi,$Zlo,8        }       //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi; and     $rem[0]=$rem[0],$mask0xff       //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
        xor     $Zhi=$Zhi,$rem[1]               //(p19) Z.hi^=rem_8bit[rem]<<48
        add     $Btbl=$xi[1],$Htbl      };;     //(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers

$code.=<<___;   # (p18),(p19)
{ .mfi; ld8     $Alo=[$Atbl],8                  //(p18) Htable[nlo].lo,&Htable[nlo].hi
        shr.u   $Zhi=$Zhi,8             }       //(p19) Z.hi>>=8
{ .mfi; shladd  $rem[1]=$rem[1],1,$rem_8bit     //(p19) &rem_8bit[rem]
        xor     $Zlo=$Zlo,$Blo          };;     //(p19) Z.lo^=Hshr4[nhi].lo
{ .mfi; ld8     $Ahi=[$Atbl]                    //(p18) Htable[nlo].hi
        xor     $Zlo=$Zlo,$Alo          }       //(p18) Z.lo^=Htable[nlo].lo
{ .mfi; ld2     $rem[1]=[$rem[1]]               //(p19) rem_8bit[rem]
        xor     $Zhi=$Zhi,$Bhi          };;     //(p19) Z.hi^=Hshr4[nhi].hi
{ .mfi; ld8     $Blo=[$Btbl],8                  //(p18) Htable[nhi].lo,&Htable[nhi].hi
        shl     $rem[1]=$rem[1],48      }       //(p19) rem_8bit[rem]<<48
{ .mfi; shladd  $rem[0]=$Zlo,4,r0               //(p18) Z.lo<<4
        xor     $Zhi=$Zhi,$Ahi          };;     //(p18) Z.hi^=Htable[nlo].hi
{ .mfi; ld8     $Bhi=[$Btbl]                    //(p18) Htable[nhi].hi
        shrp    $Zlo=$Zhi,$Zlo,4        }       //(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4)
{ .mfi; and     $rem[0]=$rem[0],$mask0xff       //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
        xor     $Zhi=$Zhi,$rem[1]       };;     //(p19) Z.hi^=rem_8bit[rem]<<48
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers

$code.=<<___;   # (p19)
{ .mmi; cmp.ltu p6,p0=$inp,$len
        add     $inp=32,$inp
        shr.u   $Zhi=$Zhi,4             }       //(p19) Z.hi>>=4
{ .mmi; shladd  $rem[1]=$rem[1],1,$rem_8bit     //(p19) &rem_8bit[rem]
        xor     $Zlo=$Zlo,$Blo                  //(p19) Z.lo^=Hshr4[nhi].lo
        add     $Xip=9,$Xip             };;     //      &Xi.lo
{ .mmi; ld2     $rem[1]=[$rem[1]]               //(p19) rem_8bit[rem]
(p6)    ld1     $in=[$inp],-1                   //[p16] *inp--
(p6)    extr.u  $xi[1]=$Zlo,8,8         }       //[p17] Xi[14]
{ .mmi; xor     $Zhi=$Zhi,$Bhi                  //(p19) Z.hi^=Hshr4[nhi].hi
(p6)    and     $xi[0]=$Zlo,$mask0xff   };;     //[p16] Xi[15]
{ .mmi; st8     [$Xip]=$Zlo,-8
(p6)    xor     $xi[0]=$xi[0],$in               //[p17] xi=$xi[i]^inp[i]
        shl     $rem[1]=$rem[1],48      };;     //(p19) rem_8bit[rem]<<48
{ .mmi;
(p6)    ld1     $in=[$inp],-1                   //[p16] *inp--
        xor     $Zhi=$Zhi,$rem[1]               //(p19) Z.hi^=rem_8bit[rem]<<48
(p6)    dep     $Atbl=$xi[0],$Htbl,4,4  }       //[p17] &Htable[nlo].lo
{ .mib;
(p6)    and     $xi[0]=-16,$xi[0]               //[p17] nhi=xi&0xf0
(p6)    br.cond.dptk.many       .LOOP   };;

{ .mib; st8     [$Xip]=$Zhi             };;
{ .mib; $rum    1<<1                            // return to little-endian
        .restore        sp
        mov     sp=prevsp
        br.ret.sptk.many        b0      };;
.endp   gcm_ghash_4bit#
___

######################################################################
# Shared reduction tables. rem_4bit must stay 128-byte aligned: the
# 'dep' in loop() builds &rem_4bit[Zlo&0xf] by inserting bits into its
# address.
#
$code.=<<___;
.align  128
.type   rem_4bit#,\@object
rem_4bit:
        data8   0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
        data8   0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
        data8   0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
        data8   0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
.size   rem_4bit#,128
.type   rem_8bit#,\@object
rem_8bit:
        data1   0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E
        data1   0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E
        data1   0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E
        data1   0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E
        data1   0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E
        data1   0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E
        data1   0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E
        data1   0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E
        data1   0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE
        data1   0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE
        data1   0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE
        data1   0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE
        data1   0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E
        data1   0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E
        data1   0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE
        data1   0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE
        data1   0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E
        data1   0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E
        data1   0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E
        data1   0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E
        data1   0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E
        data1   0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E
        data1   0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E
        data1   0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E
        data1   0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE
        data1   0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE
        data1   0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE
        data1   0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE
        data1   0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E
        data1   0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E
        data1   0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE
        data1   0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE
.size   rem_8bit#,512
stringz "GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
___

# Big-endian targets need no byte reversal: turn each mux1 ...,@rev
# into a no-op that keeps the bundle shape.
$code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm      if ($big_endian);
# Evaluate the `...` arithmetic left in the templates (register numbers
# and table indices).
$code =~ s/\`([^\`]*)\`/eval $1/gem;

print $code;
# Buffered write errors only surface here; do not leave a silently
# truncated assembly file behind.
close STDOUT or die "error closing STDOUT: $!";