#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# On PA-7100LC this module performs ~90-50% better, less for longer
# keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means
# that compiler utilized xmpyu instruction to perform 32x32=64-bit
# multiplication, which in turn means that "baseline" performance was
# optimal in respect to instruction set capabilities. Fair comparison
# with vendor compiler is problematic, because OpenSSL doesn't define
# BN_LLONG [presumably] for historical reasons, which drives compiler
# toward 4 times 16x16=32-bit multiplications [plus complementary
# shifts and additions] instead. This means that you should observe
# several times improvement over code generated by vendor compiler
# for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
# improvement coefficient was never collected on PA-7100LC, or any
# other 1.1 CPU, because I don't have access to such machine with
# vendor compiler. But to give you a taste, PA-RISC 1.1 code path
# reportedly outperformed code generated by cc +DA1.1 +O3 by factor
# of ~5x on PA-8600.
#
# On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
# reportedly ~2x faster than vendor compiler generated code [according
# to comment in pa-risc2[W].s]. Here comes a catch. Execution core of
# this implementation is actually 32-bit one, in the sense that it
# operates on 32-bit values. But pa-risc2[W].s operates on arrays of
# 64-bit BN_LONGs... How do they interoperate then? No problem. This
# module picks halves of 64-bit values in reverse order and pretends
# they were 32-bit BN_LONGs. But can 32-bit core compete with "pure"
# 64-bit code such as pa-risc2[W].s then? Well, the thing is that
# 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do,
# i.e. there is no "wider" multiplication like on most other 64-bit
# platforms. This means that even being effectively 32-bit, this
# implementation performs "64-bit" computational task in same amount
# of arithmetic operations, most notably multiplications. It requires
# more memory references, most notably to tp[num], but this doesn't
# seem to exhaust memory port capacity. And indeed, dedicated PA-RISC
# 2.0 code path provides virtually same performance as pa-risc2[W].s:
# it's ~10% better for shortest key length and ~10% worse for longest
# one.
#
# In case it wasn't clear: the module has two distinct code paths,
# PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit
# additions and 64-bit integer loads, not to mention specific
# instruction scheduling. In 64-bit build naturally only 2.0 code path
# is assembled.
# In 32-bit application context both code paths are
# assembled, PA-RISC 2.0 CPU is detected at run-time and proper path
# is taken automatically. Also, in 32-bit build the module imposes
# a couple of limitations: vector lengths have to be even and vector
# addresses have to be 64-bit aligned. Normally neither is a problem:
# most common key lengths are even and vectors are commonly malloc-ed,
# which ensures alignment.
#
# Special thanks to polarhome.com for providing HP-UX account on
# PA-RISC 1.1 machine, and to correspondent who chose to remain
# anonymous for testing the code on PA-RISC 2.0 machine.

# Directory part of the script path (with trailing separator), used to
# locate opensslconf.h below. Take the capture only if the match
# succeeded; otherwise fall back to the empty string so that $dir is
# always defined and never picks up a stale $1.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

# Command line: <flavour> [<output file>]
$flavour = shift;
$output  = shift;

# Redirect generated code to the output file. Without an output
# argument STDOUT is left alone; a path that cannot be opened is fatal
# instead of silently discarding everything that gets printed.
if (defined $output) {
	open STDOUT, '>', $output or die "can't open $output: $!";
}

# Per-ABI parameters: assembler level, pointer size, frame-marker size,
# offset of the saved return pointer, register push/pop mnemonics, and
# the BN_ULONG size in bytes ($BN_SZ).
if ($flavour =~ /64/) {
	$LEVEL		="2.0W";
	$SIZE_T		=8;
	$FRAME_MARKER	=80;
	$SAVED_RP	=16;
	$PUSH		="std";
	$PUSHMA		="std,ma";
	$POP		="ldd";
	$POPMB		="ldd,mb";
	$BN_SZ		=$SIZE_T;
} else {
	$LEVEL		="1.1";	#$LEVEL.="\n\t.ALLOW\t2.0";
	$SIZE_T		=4;
	$FRAME_MARKER	=48;
	$SAVED_RP	=20;
	$PUSH		="stw";
	$PUSHMA		="stwm";
	$POP		="ldw";
	$POPMB		="ldwm";
	$BN_SZ		=$SIZE_T;
	# A 32-bit build may still be configured with 64-bit BN_ULONG;
	# look for SIXTY_FOUR_BIT* in opensslconf.h. This is best-effort:
	# if the header cannot be opened the 32-bit defaults stay.
	if (open my $conf, '<', "${dir}../../opensslconf.h") {
	    while (my $line = <$conf>) {
		if ($line =~ m/#\s*define\s+SIXTY_FOUR_BIT/) {
		    $BN_SZ=8;
		    $LEVEL="2.0";
		    last;
		}
	    }
	    close $conf;
	}
}

$FRAME=8*$SIZE_T+$FRAME_MARKER;	# 8 saved regs + frame marker
				#			[+ argument transfer]
# Frame layout. $FRAME arrives here as 8*$SIZE_T+$FRAME_MARKER, so
# $LOCALS evaluates to 8*$SIZE_T: the first $fp-relative offset past
# the eight register save slots written by the prologue below.
$LOCALS=$FRAME-$FRAME_MARKER;
$FRAME+=32;			# local variables

# General register assignment. Each variable holds a register name
# that is interpolated into the assembly heredocs.
$tp="%r31";
$ti1="%r29";
$ti0="%r28";

# %r26..%r23 are declared as the four general-register arguments
# (ARGW0..ARGW3=GR) in the .EXPORT directive below.
$rp="%r26";
$ap="%r25";
$bp="%r24";
$np="%r23";
$n0="%r22";	# passed through stack in 32-bit
$num="%r21";	# passed through stack in 32-bit
$idx="%r20";
$arrsz="%r19";

$nm1="%r7";
$nm0="%r6";
$ab1="%r5";
$ab0="%r4";

$fp="%r3";
$hi1="%r2";
$hi0="%r1";

# $xfer is an alias for the same register as $n0 (%r22).
$xfer=$n0;	# accommodates [-16..15] offset in fld[dw]s

# Floating-point register assignment. $fti aliases $fm0; $fbi and $fn0
# are the left and right halves of %fr5.
$fm0="%fr4";	$fti=$fm0;
$fbi="%fr5L";
$fn0="%fr5R";
$fai="%fr6";	$fab0="%fr7";	$fab1="%fr8";
$fni="%fr9";	$fnm0="%fr10";	$fnm1="%fr11";

# Function header and prologue: save %r2 (return pointer) and
# %r3..%r10, bump %sp by $FRAME, and point $fp at the frame base.
$code=<<___;
	.LEVEL	$LEVEL
	.SPACE	\$TEXT\$
	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY

	.EXPORT	bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
	.ALIGN	64
bn_mul_mont
	.PROC
	.CALLINFO	FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6
	.ENTRY
	$PUSH	%r2,-$SAVED_RP(%sp)		; standard prologue
	$PUSHMA	%r3,$FRAME(%sp)
	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
	ldo	-$FRAME(%sp),$fp
___
# Emitted for 32-bit pointer builds only; the heredoc body follows.
$code.=<<___ if ($SIZE_T==4);
ldw `-$FRAME_MARKER-4`($fp),$n0 161238384Sjkim ldw `-$FRAME_MARKER-8`($fp),$num 162238384Sjkim nop 163238384Sjkim nop ; alignment 164238384Sjkim___ 165238384Sjkim$code.=<<___ if ($BN_SZ==4); 166238384Sjkim comiclr,<= 6,$num,%r0 ; are vectors long enough? 167238384Sjkim b L\$abort 168238384Sjkim ldi 0,%r28 ; signal "unhandled" 169238384Sjkim add,ev %r0,$num,$num ; is $num even? 170238384Sjkim b L\$abort 171238384Sjkim nop 172238384Sjkim or $ap,$np,$ti1 173238384Sjkim extru,= $ti1,31,3,%r0 ; are ap and np 64-bit aligned? 174238384Sjkim b L\$abort 175238384Sjkim nop 176238384Sjkim nop ; alignment 177238384Sjkim nop 178238384Sjkim 179238384Sjkim fldws 0($n0),${fn0} 180238384Sjkim fldws,ma 4($bp),${fbi} ; bp[0] 181238384Sjkim___ 182238384Sjkim$code.=<<___ if ($BN_SZ==8); 183238384Sjkim comib,> 3,$num,L\$abort ; are vectors long enough? 184238384Sjkim ldi 0,%r28 ; signal "unhandled" 185238384Sjkim addl $num,$num,$num ; I operate on 32-bit values 186238384Sjkim 187238384Sjkim fldws 4($n0),${fn0} ; only low part of n0 188238384Sjkim fldws 4($bp),${fbi} ; bp[0] in flipped word order 189238384Sjkim___ 190238384Sjkim$code.=<<___; 191238384Sjkim fldds 0($ap),${fai} ; ap[0,1] 192238384Sjkim fldds 0($np),${fni} ; np[0,1] 193238384Sjkim 194238384Sjkim sh2addl $num,%r0,$arrsz 195238384Sjkim ldi 31,$hi0 196238384Sjkim ldo 36($arrsz),$hi1 ; space for tp[num+1] 197238384Sjkim andcm $hi1,$hi0,$hi1 ; align 198238384Sjkim addl $hi1,%sp,%sp 199238384Sjkim $PUSH $fp,-$SIZE_T(%sp) 200238384Sjkim 201238384Sjkim ldo `$LOCALS+16`($fp),$xfer 202238384Sjkim ldo `$LOCALS+32+4`($fp),$tp 203238384Sjkim 204238384Sjkim xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[0] 205238384Sjkim xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[0] 206238384Sjkim xmpyu ${fn0},${fab0}R,${fm0} 207238384Sjkim 208238384Sjkim addl $arrsz,$ap,$ap ; point at the end 209238384Sjkim addl $arrsz,$np,$np 210238384Sjkim subi 0,$arrsz,$idx ; j=0 211238384Sjkim ldo 8($idx),$idx ; j++++ 212238384Sjkim 213238384Sjkim xmpyu 
${fni}L,${fm0}R,${fnm0} ; np[0]*m 214238384Sjkim xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m 215238384Sjkim fstds ${fab0},-16($xfer) 216238384Sjkim fstds ${fnm0},-8($xfer) 217238384Sjkim fstds ${fab1},0($xfer) 218238384Sjkim fstds ${fnm1},8($xfer) 219238384Sjkim flddx $idx($ap),${fai} ; ap[2,3] 220238384Sjkim flddx $idx($np),${fni} ; np[2,3] 221238384Sjkim___ 222238384Sjkim$code.=<<___ if ($BN_SZ==4); 223238384Sjkim mtctl $hi0,%cr11 ; $hi0 still holds 31 224238384Sjkim extrd,u,*= $hi0,%sar,1,$hi0 ; executes on PA-RISC 1.0 225238384Sjkim b L\$parisc11 226238384Sjkim nop 227238384Sjkim___ 228238384Sjkim$code.=<<___; # PA-RISC 2.0 code-path 229238384Sjkim xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] 230238384Sjkim xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 231238384Sjkim ldd -16($xfer),$ab0 232238384Sjkim fstds ${fab0},-16($xfer) 233238384Sjkim 234238384Sjkim extrd,u $ab0,31,32,$hi0 235238384Sjkim extrd,u $ab0,63,32,$ab0 236238384Sjkim ldd -8($xfer),$nm0 237238384Sjkim fstds ${fnm0},-8($xfer) 238238384Sjkim ldo 8($idx),$idx ; j++++ 239238384Sjkim addl $ab0,$nm0,$nm0 ; low part is discarded 240238384Sjkim extrd,u $nm0,31,32,$hi1 241238384Sjkim 242238384SjkimL\$1st 243238384Sjkim xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0] 244238384Sjkim xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m 245238384Sjkim ldd 0($xfer),$ab1 246238384Sjkim fstds ${fab1},0($xfer) 247238384Sjkim addl $hi0,$ab1,$ab1 248238384Sjkim extrd,u $ab1,31,32,$hi0 249238384Sjkim ldd 8($xfer),$nm1 250238384Sjkim fstds ${fnm1},8($xfer) 251238384Sjkim extrd,u $ab1,63,32,$ab1 252238384Sjkim addl $hi1,$nm1,$nm1 253238384Sjkim flddx $idx($ap),${fai} ; ap[j,j+1] 254238384Sjkim flddx $idx($np),${fni} ; np[j,j+1] 255238384Sjkim addl $ab1,$nm1,$nm1 256238384Sjkim extrd,u $nm1,31,32,$hi1 257238384Sjkim 258238384Sjkim xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] 259238384Sjkim xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 260238384Sjkim ldd -16($xfer),$ab0 261238384Sjkim fstds ${fab0},-16($xfer) 262238384Sjkim addl $hi0,$ab0,$ab0 
263238384Sjkim extrd,u $ab0,31,32,$hi0 264238384Sjkim ldd -8($xfer),$nm0 265238384Sjkim fstds ${fnm0},-8($xfer) 266238384Sjkim extrd,u $ab0,63,32,$ab0 267238384Sjkim addl $hi1,$nm0,$nm0 268238384Sjkim stw $nm1,-4($tp) ; tp[j-1] 269238384Sjkim addl $ab0,$nm0,$nm0 270238384Sjkim stw,ma $nm0,8($tp) ; tp[j-1] 271238384Sjkim addib,<> 8,$idx,L\$1st ; j++++ 272238384Sjkim extrd,u $nm0,31,32,$hi1 273238384Sjkim 274238384Sjkim xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0] 275238384Sjkim xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m 276238384Sjkim ldd 0($xfer),$ab1 277238384Sjkim fstds ${fab1},0($xfer) 278238384Sjkim addl $hi0,$ab1,$ab1 279238384Sjkim extrd,u $ab1,31,32,$hi0 280238384Sjkim ldd 8($xfer),$nm1 281238384Sjkim fstds ${fnm1},8($xfer) 282238384Sjkim extrd,u $ab1,63,32,$ab1 283238384Sjkim addl $hi1,$nm1,$nm1 284238384Sjkim ldd -16($xfer),$ab0 285238384Sjkim addl $ab1,$nm1,$nm1 286238384Sjkim ldd -8($xfer),$nm0 287238384Sjkim extrd,u $nm1,31,32,$hi1 288238384Sjkim 289238384Sjkim addl $hi0,$ab0,$ab0 290238384Sjkim extrd,u $ab0,31,32,$hi0 291238384Sjkim stw $nm1,-4($tp) ; tp[j-1] 292238384Sjkim extrd,u $ab0,63,32,$ab0 293238384Sjkim addl $hi1,$nm0,$nm0 294238384Sjkim ldd 0($xfer),$ab1 295238384Sjkim addl $ab0,$nm0,$nm0 296238384Sjkim ldd,mb 8($xfer),$nm1 297238384Sjkim extrd,u $nm0,31,32,$hi1 298238384Sjkim stw,ma $nm0,8($tp) ; tp[j-1] 299238384Sjkim 300238384Sjkim ldo -1($num),$num ; i-- 301238384Sjkim subi 0,$arrsz,$idx ; j=0 302238384Sjkim___ 303238384Sjkim$code.=<<___ if ($BN_SZ==4); 304238384Sjkim fldws,ma 4($bp),${fbi} ; bp[1] 305238384Sjkim___ 306238384Sjkim$code.=<<___ if ($BN_SZ==8); 307238384Sjkim fldws 0($bp),${fbi} ; bp[1] in flipped word order 308238384Sjkim___ 309238384Sjkim$code.=<<___; 310238384Sjkim flddx $idx($ap),${fai} ; ap[0,1] 311238384Sjkim flddx $idx($np),${fni} ; np[0,1] 312238384Sjkim fldws 8($xfer),${fti}R ; tp[0] 313238384Sjkim addl $hi0,$ab1,$ab1 314238384Sjkim extrd,u $ab1,31,32,$hi0 315238384Sjkim extrd,u $ab1,63,32,$ab1 316238384Sjkim ldo 
8($idx),$idx ; j++++ 317238384Sjkim xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1] 318238384Sjkim xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1] 319238384Sjkim addl $hi1,$nm1,$nm1 320238384Sjkim addl $ab1,$nm1,$nm1 321238384Sjkim extrd,u $nm1,31,32,$hi1 322238384Sjkim fstws,mb ${fab0}L,-8($xfer) ; save high part 323238384Sjkim stw $nm1,-4($tp) ; tp[j-1] 324238384Sjkim 325238384Sjkim fcpy,sgl %fr0,${fti}L ; zero high part 326238384Sjkim fcpy,sgl %fr0,${fab0}L 327238384Sjkim addl $hi1,$hi0,$hi0 328238384Sjkim extrd,u $hi0,31,32,$hi1 329238384Sjkim fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double 330238384Sjkim fcnvxf,dbl,dbl ${fab0},${fab0} 331238384Sjkim stw $hi0,0($tp) 332238384Sjkim stw $hi1,4($tp) 333238384Sjkim 334238384Sjkim fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] 335238384Sjkim fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int 336238384Sjkim xmpyu ${fn0},${fab0}R,${fm0} 337238384Sjkim ldo `$LOCALS+32+4`($fp),$tp 338238384SjkimL\$outer 339238384Sjkim xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m 340238384Sjkim xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m 341238384Sjkim fstds ${fab0},-16($xfer) ; 33-bit value 342238384Sjkim fstds ${fnm0},-8($xfer) 343238384Sjkim flddx $idx($ap),${fai} ; ap[2] 344238384Sjkim flddx $idx($np),${fni} ; np[2] 345238384Sjkim ldo 8($idx),$idx ; j++++ 346238384Sjkim ldd -16($xfer),$ab0 ; 33-bit value 347238384Sjkim ldd -8($xfer),$nm0 348238384Sjkim ldw 0($xfer),$hi0 ; high part 349238384Sjkim 350238384Sjkim xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] 351238384Sjkim xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 352238384Sjkim extrd,u $ab0,31,32,$ti0 ; carry bit 353238384Sjkim extrd,u $ab0,63,32,$ab0 354238384Sjkim fstds ${fab1},0($xfer) 355238384Sjkim addl $ti0,$hi0,$hi0 ; account carry bit 356238384Sjkim fstds ${fnm1},8($xfer) 357238384Sjkim addl $ab0,$nm0,$nm0 ; low part is discarded 358238384Sjkim ldw 0($tp),$ti1 ; tp[1] 359238384Sjkim extrd,u $nm0,31,32,$hi1 360238384Sjkim fstds ${fab0},-16($xfer) 361238384Sjkim fstds 
${fnm0},-8($xfer) 362238384Sjkim 363238384SjkimL\$inner 364238384Sjkim xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i] 365238384Sjkim xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m 366238384Sjkim ldd 0($xfer),$ab1 367238384Sjkim fstds ${fab1},0($xfer) 368238384Sjkim addl $hi0,$ti1,$ti1 369238384Sjkim addl $ti1,$ab1,$ab1 370238384Sjkim ldd 8($xfer),$nm1 371238384Sjkim fstds ${fnm1},8($xfer) 372238384Sjkim extrd,u $ab1,31,32,$hi0 373238384Sjkim extrd,u $ab1,63,32,$ab1 374238384Sjkim flddx $idx($ap),${fai} ; ap[j,j+1] 375238384Sjkim flddx $idx($np),${fni} ; np[j,j+1] 376238384Sjkim addl $hi1,$nm1,$nm1 377238384Sjkim addl $ab1,$nm1,$nm1 378238384Sjkim ldw 4($tp),$ti0 ; tp[j] 379238384Sjkim stw $nm1,-4($tp) ; tp[j-1] 380238384Sjkim 381238384Sjkim xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] 382238384Sjkim xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 383238384Sjkim ldd -16($xfer),$ab0 384238384Sjkim fstds ${fab0},-16($xfer) 385238384Sjkim addl $hi0,$ti0,$ti0 386238384Sjkim addl $ti0,$ab0,$ab0 387238384Sjkim ldd -8($xfer),$nm0 388238384Sjkim fstds ${fnm0},-8($xfer) 389238384Sjkim extrd,u $ab0,31,32,$hi0 390238384Sjkim extrd,u $nm1,31,32,$hi1 391238384Sjkim ldw 8($tp),$ti1 ; tp[j] 392238384Sjkim extrd,u $ab0,63,32,$ab0 393238384Sjkim addl $hi1,$nm0,$nm0 394238384Sjkim addl $ab0,$nm0,$nm0 395238384Sjkim stw,ma $nm0,8($tp) ; tp[j-1] 396238384Sjkim addib,<> 8,$idx,L\$inner ; j++++ 397238384Sjkim extrd,u $nm0,31,32,$hi1 398238384Sjkim 399238384Sjkim xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i] 400238384Sjkim xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m 401238384Sjkim ldd 0($xfer),$ab1 402238384Sjkim fstds ${fab1},0($xfer) 403238384Sjkim addl $hi0,$ti1,$ti1 404238384Sjkim addl $ti1,$ab1,$ab1 405238384Sjkim ldd 8($xfer),$nm1 406238384Sjkim fstds ${fnm1},8($xfer) 407238384Sjkim extrd,u $ab1,31,32,$hi0 408238384Sjkim extrd,u $ab1,63,32,$ab1 409238384Sjkim ldw 4($tp),$ti0 ; tp[j] 410238384Sjkim addl $hi1,$nm1,$nm1 411238384Sjkim addl $ab1,$nm1,$nm1 412238384Sjkim ldd -16($xfer),$ab0 413238384Sjkim ldd 
-8($xfer),$nm0 414238384Sjkim extrd,u $nm1,31,32,$hi1 415238384Sjkim 416238384Sjkim addl $hi0,$ab0,$ab0 417238384Sjkim addl $ti0,$ab0,$ab0 418238384Sjkim stw $nm1,-4($tp) ; tp[j-1] 419238384Sjkim extrd,u $ab0,31,32,$hi0 420238384Sjkim ldw 8($tp),$ti1 ; tp[j] 421238384Sjkim extrd,u $ab0,63,32,$ab0 422238384Sjkim addl $hi1,$nm0,$nm0 423238384Sjkim ldd 0($xfer),$ab1 424238384Sjkim addl $ab0,$nm0,$nm0 425238384Sjkim ldd,mb 8($xfer),$nm1 426238384Sjkim extrd,u $nm0,31,32,$hi1 427238384Sjkim stw,ma $nm0,8($tp) ; tp[j-1] 428238384Sjkim 429238384Sjkim addib,= -1,$num,L\$outerdone ; i-- 430238384Sjkim subi 0,$arrsz,$idx ; j=0 431238384Sjkim___ 432238384Sjkim$code.=<<___ if ($BN_SZ==4); 433238384Sjkim fldws,ma 4($bp),${fbi} ; bp[i] 434238384Sjkim___ 435238384Sjkim$code.=<<___ if ($BN_SZ==8); 436238384Sjkim ldi 12,$ti0 ; bp[i] in flipped word order 437238384Sjkim addl,ev %r0,$num,$num 438238384Sjkim ldi -4,$ti0 439238384Sjkim addl $ti0,$bp,$bp 440238384Sjkim fldws 0($bp),${fbi} 441238384Sjkim___ 442238384Sjkim$code.=<<___; 443238384Sjkim flddx $idx($ap),${fai} ; ap[0] 444238384Sjkim addl $hi0,$ab1,$ab1 445238384Sjkim flddx $idx($np),${fni} ; np[0] 446238384Sjkim fldws 8($xfer),${fti}R ; tp[0] 447238384Sjkim addl $ti1,$ab1,$ab1 448238384Sjkim extrd,u $ab1,31,32,$hi0 449238384Sjkim extrd,u $ab1,63,32,$ab1 450238384Sjkim 451238384Sjkim ldo 8($idx),$idx ; j++++ 452238384Sjkim xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i] 453238384Sjkim xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i] 454238384Sjkim ldw 4($tp),$ti0 ; tp[j] 455238384Sjkim 456238384Sjkim addl $hi1,$nm1,$nm1 457238384Sjkim fstws,mb ${fab0}L,-8($xfer) ; save high part 458238384Sjkim addl $ab1,$nm1,$nm1 459238384Sjkim extrd,u $nm1,31,32,$hi1 460238384Sjkim fcpy,sgl %fr0,${fti}L ; zero high part 461238384Sjkim fcpy,sgl %fr0,${fab0}L 462238384Sjkim stw $nm1,-4($tp) ; tp[j-1] 463238384Sjkim 464238384Sjkim fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double 465238384Sjkim fcnvxf,dbl,dbl ${fab0},${fab0} 466238384Sjkim addl 
$hi1,$hi0,$hi0 467238384Sjkim fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] 468238384Sjkim addl $ti0,$hi0,$hi0 469238384Sjkim extrd,u $hi0,31,32,$hi1 470238384Sjkim fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int 471238384Sjkim stw $hi0,0($tp) 472238384Sjkim stw $hi1,4($tp) 473238384Sjkim xmpyu ${fn0},${fab0}R,${fm0} 474238384Sjkim 475238384Sjkim b L\$outer 476238384Sjkim ldo `$LOCALS+32+4`($fp),$tp 477238384Sjkim 478238384SjkimL\$outerdone 479238384Sjkim addl $hi0,$ab1,$ab1 480238384Sjkim addl $ti1,$ab1,$ab1 481238384Sjkim extrd,u $ab1,31,32,$hi0 482238384Sjkim extrd,u $ab1,63,32,$ab1 483238384Sjkim 484238384Sjkim ldw 4($tp),$ti0 ; tp[j] 485238384Sjkim 486238384Sjkim addl $hi1,$nm1,$nm1 487238384Sjkim addl $ab1,$nm1,$nm1 488238384Sjkim extrd,u $nm1,31,32,$hi1 489238384Sjkim stw $nm1,-4($tp) ; tp[j-1] 490238384Sjkim 491238384Sjkim addl $hi1,$hi0,$hi0 492238384Sjkim addl $ti0,$hi0,$hi0 493238384Sjkim extrd,u $hi0,31,32,$hi1 494238384Sjkim stw $hi0,0($tp) 495238384Sjkim stw $hi1,4($tp) 496238384Sjkim 497238384Sjkim ldo `$LOCALS+32`($fp),$tp 498238384Sjkim sub %r0,%r0,%r0 ; clear borrow 499238384Sjkim___ 500238384Sjkim$code.=<<___ if ($BN_SZ==4); 501238384Sjkim ldws,ma 4($tp),$ti0 502238384Sjkim extru,= $rp,31,3,%r0 ; is rp 64-bit aligned? 
503238384Sjkim b L\$sub_pa11 504238384Sjkim addl $tp,$arrsz,$tp 505238384SjkimL\$sub 506238384Sjkim ldwx $idx($np),$hi0 507238384Sjkim subb $ti0,$hi0,$hi1 508238384Sjkim ldwx $idx($tp),$ti0 509238384Sjkim addib,<> 4,$idx,L\$sub 510238384Sjkim stws,ma $hi1,4($rp) 511238384Sjkim 512238384Sjkim subb $ti0,%r0,$hi1 513238384Sjkim ldo -4($tp),$tp 514238384Sjkim___ 515238384Sjkim$code.=<<___ if ($BN_SZ==8); 516238384Sjkim ldd,ma 8($tp),$ti0 517238384SjkimL\$sub 518238384Sjkim ldd $idx($np),$hi0 519238384Sjkim shrpd $ti0,$ti0,32,$ti0 ; flip word order 520238384Sjkim std $ti0,-8($tp) ; save flipped value 521238384Sjkim sub,db $ti0,$hi0,$hi1 522238384Sjkim ldd,ma 8($tp),$ti0 523238384Sjkim addib,<> 8,$idx,L\$sub 524238384Sjkim std,ma $hi1,8($rp) 525238384Sjkim 526238384Sjkim extrd,u $ti0,31,32,$ti0 ; carry in flipped word order 527238384Sjkim sub,db $ti0,%r0,$hi1 528238384Sjkim ldo -8($tp),$tp 529238384Sjkim___ 530238384Sjkim$code.=<<___; 531238384Sjkim and $tp,$hi1,$ap 532238384Sjkim andcm $rp,$hi1,$bp 533238384Sjkim or $ap,$bp,$np 534238384Sjkim 535238384Sjkim sub $rp,$arrsz,$rp ; rewind rp 536238384Sjkim subi 0,$arrsz,$idx 537238384Sjkim ldo `$LOCALS+32`($fp),$tp 538238384SjkimL\$copy 539238384Sjkim ldd $idx($np),$hi0 540238384Sjkim std,ma %r0,8($tp) 541238384Sjkim addib,<> 8,$idx,.-8 ; L\$copy 542238384Sjkim std,ma $hi0,8($rp) 543238384Sjkim___ 544238384Sjkim 545238384Sjkimif ($BN_SZ==4) { # PA-RISC 1.1 code-path 546238384Sjkim$ablo=$ab0; 547238384Sjkim$abhi=$ab1; 548238384Sjkim$nmlo0=$nm0; 549238384Sjkim$nmhi0=$nm1; 550238384Sjkim$nmlo1="%r9"; 551238384Sjkim$nmhi1="%r8"; 552238384Sjkim 553238384Sjkim$code.=<<___; 554238384Sjkim b L\$done 555238384Sjkim nop 556238384Sjkim 557238384Sjkim .ALIGN 8 558238384SjkimL\$parisc11 559238384Sjkim xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] 560238384Sjkim xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 561238384Sjkim ldw -12($xfer),$ablo 562238384Sjkim ldw -16($xfer),$hi0 563238384Sjkim ldw -4($xfer),$nmlo0 564238384Sjkim ldw 
-8($xfer),$nmhi0 565238384Sjkim fstds ${fab0},-16($xfer) 566238384Sjkim fstds ${fnm0},-8($xfer) 567238384Sjkim 568238384Sjkim ldo 8($idx),$idx ; j++++ 569238384Sjkim add $ablo,$nmlo0,$nmlo0 ; discarded 570238384Sjkim addc %r0,$nmhi0,$hi1 571238384Sjkim ldw 4($xfer),$ablo 572238384Sjkim ldw 0($xfer),$abhi 573238384Sjkim nop 574238384Sjkim 575238384SjkimL\$1st_pa11 576238384Sjkim xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0] 577238384Sjkim flddx $idx($ap),${fai} ; ap[j,j+1] 578238384Sjkim xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m 579238384Sjkim flddx $idx($np),${fni} ; np[j,j+1] 580238384Sjkim add $hi0,$ablo,$ablo 581238384Sjkim ldw 12($xfer),$nmlo1 582238384Sjkim addc %r0,$abhi,$hi0 583238384Sjkim ldw 8($xfer),$nmhi1 584238384Sjkim add $ablo,$nmlo1,$nmlo1 585238384Sjkim fstds ${fab1},0($xfer) 586238384Sjkim addc %r0,$nmhi1,$nmhi1 587238384Sjkim fstds ${fnm1},8($xfer) 588238384Sjkim add $hi1,$nmlo1,$nmlo1 589238384Sjkim ldw -12($xfer),$ablo 590238384Sjkim addc %r0,$nmhi1,$hi1 591238384Sjkim ldw -16($xfer),$abhi 592238384Sjkim 593238384Sjkim xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] 594238384Sjkim ldw -4($xfer),$nmlo0 595238384Sjkim xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 596238384Sjkim ldw -8($xfer),$nmhi0 597238384Sjkim add $hi0,$ablo,$ablo 598238384Sjkim stw $nmlo1,-4($tp) ; tp[j-1] 599238384Sjkim addc %r0,$abhi,$hi0 600238384Sjkim fstds ${fab0},-16($xfer) 601238384Sjkim add $ablo,$nmlo0,$nmlo0 602238384Sjkim fstds ${fnm0},-8($xfer) 603238384Sjkim addc %r0,$nmhi0,$nmhi0 604238384Sjkim ldw 0($xfer),$abhi 605238384Sjkim add $hi1,$nmlo0,$nmlo0 606238384Sjkim ldw 4($xfer),$ablo 607238384Sjkim stws,ma $nmlo0,8($tp) ; tp[j-1] 608238384Sjkim addib,<> 8,$idx,L\$1st_pa11 ; j++++ 609238384Sjkim addc %r0,$nmhi0,$hi1 610238384Sjkim 611238384Sjkim ldw 8($xfer),$nmhi1 612238384Sjkim ldw 12($xfer),$nmlo1 613238384Sjkim xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0] 614238384Sjkim xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m 615238384Sjkim add $hi0,$ablo,$ablo 616238384Sjkim fstds 
${fab1},0($xfer) 617238384Sjkim addc %r0,$abhi,$hi0 618238384Sjkim fstds ${fnm1},8($xfer) 619238384Sjkim add $ablo,$nmlo1,$nmlo1 620238384Sjkim ldw -16($xfer),$abhi 621238384Sjkim addc %r0,$nmhi1,$nmhi1 622238384Sjkim ldw -12($xfer),$ablo 623238384Sjkim add $hi1,$nmlo1,$nmlo1 624238384Sjkim ldw -8($xfer),$nmhi0 625238384Sjkim addc %r0,$nmhi1,$hi1 626238384Sjkim ldw -4($xfer),$nmlo0 627238384Sjkim 628238384Sjkim add $hi0,$ablo,$ablo 629238384Sjkim stw $nmlo1,-4($tp) ; tp[j-1] 630238384Sjkim addc %r0,$abhi,$hi0 631238384Sjkim ldw 0($xfer),$abhi 632238384Sjkim add $ablo,$nmlo0,$nmlo0 633238384Sjkim ldw 4($xfer),$ablo 634238384Sjkim addc %r0,$nmhi0,$nmhi0 635238384Sjkim ldws,mb 8($xfer),$nmhi1 636238384Sjkim add $hi1,$nmlo0,$nmlo0 637238384Sjkim ldw 4($xfer),$nmlo1 638238384Sjkim addc %r0,$nmhi0,$hi1 639238384Sjkim stws,ma $nmlo0,8($tp) ; tp[j-1] 640238384Sjkim 641238384Sjkim ldo -1($num),$num ; i-- 642238384Sjkim subi 0,$arrsz,$idx ; j=0 643238384Sjkim 644238384Sjkim fldws,ma 4($bp),${fbi} ; bp[1] 645238384Sjkim flddx $idx($ap),${fai} ; ap[0,1] 646238384Sjkim flddx $idx($np),${fni} ; np[0,1] 647238384Sjkim fldws 8($xfer),${fti}R ; tp[0] 648238384Sjkim add $hi0,$ablo,$ablo 649238384Sjkim addc %r0,$abhi,$hi0 650238384Sjkim ldo 8($idx),$idx ; j++++ 651238384Sjkim xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1] 652238384Sjkim xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1] 653238384Sjkim add $hi1,$nmlo1,$nmlo1 654238384Sjkim addc %r0,$nmhi1,$nmhi1 655238384Sjkim add $ablo,$nmlo1,$nmlo1 656238384Sjkim addc %r0,$nmhi1,$hi1 657238384Sjkim fstws,mb ${fab0}L,-8($xfer) ; save high part 658238384Sjkim stw $nmlo1,-4($tp) ; tp[j-1] 659238384Sjkim 660238384Sjkim fcpy,sgl %fr0,${fti}L ; zero high part 661238384Sjkim fcpy,sgl %fr0,${fab0}L 662238384Sjkim add $hi1,$hi0,$hi0 663238384Sjkim addc %r0,%r0,$hi1 664238384Sjkim fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double 665238384Sjkim fcnvxf,dbl,dbl ${fab0},${fab0} 666238384Sjkim stw $hi0,0($tp) 667238384Sjkim stw $hi1,4($tp) 
668238384Sjkim 669238384Sjkim fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] 670238384Sjkim fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int 671238384Sjkim xmpyu ${fn0},${fab0}R,${fm0} 672238384Sjkim ldo `$LOCALS+32+4`($fp),$tp 673238384SjkimL\$outer_pa11 674238384Sjkim xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m 675238384Sjkim xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m 676238384Sjkim fstds ${fab0},-16($xfer) ; 33-bit value 677238384Sjkim fstds ${fnm0},-8($xfer) 678238384Sjkim flddx $idx($ap),${fai} ; ap[2,3] 679238384Sjkim flddx $idx($np),${fni} ; np[2,3] 680238384Sjkim ldw -16($xfer),$abhi ; carry bit actually 681238384Sjkim ldo 8($idx),$idx ; j++++ 682238384Sjkim ldw -12($xfer),$ablo 683238384Sjkim ldw -8($xfer),$nmhi0 684238384Sjkim ldw -4($xfer),$nmlo0 685238384Sjkim ldw 0($xfer),$hi0 ; high part 686238384Sjkim 687238384Sjkim xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] 688238384Sjkim xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 689238384Sjkim fstds ${fab1},0($xfer) 690238384Sjkim addl $abhi,$hi0,$hi0 ; account carry bit 691238384Sjkim fstds ${fnm1},8($xfer) 692238384Sjkim add $ablo,$nmlo0,$nmlo0 ; discarded 693238384Sjkim ldw 0($tp),$ti1 ; tp[1] 694238384Sjkim addc %r0,$nmhi0,$hi1 695238384Sjkim fstds ${fab0},-16($xfer) 696238384Sjkim fstds ${fnm0},-8($xfer) 697238384Sjkim ldw 4($xfer),$ablo 698238384Sjkim ldw 0($xfer),$abhi 699238384Sjkim 700238384SjkimL\$inner_pa11 701238384Sjkim xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i] 702238384Sjkim flddx $idx($ap),${fai} ; ap[j,j+1] 703238384Sjkim xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m 704238384Sjkim flddx $idx($np),${fni} ; np[j,j+1] 705238384Sjkim add $hi0,$ablo,$ablo 706238384Sjkim ldw 4($tp),$ti0 ; tp[j] 707238384Sjkim addc %r0,$abhi,$abhi 708238384Sjkim ldw 12($xfer),$nmlo1 709238384Sjkim add $ti1,$ablo,$ablo 710238384Sjkim ldw 8($xfer),$nmhi1 711238384Sjkim addc %r0,$abhi,$hi0 712238384Sjkim fstds ${fab1},0($xfer) 713238384Sjkim add $ablo,$nmlo1,$nmlo1 714238384Sjkim fstds ${fnm1},8($xfer) 715238384Sjkim addc 
%r0,$nmhi1,$nmhi1 716238384Sjkim ldw -12($xfer),$ablo 717238384Sjkim add $hi1,$nmlo1,$nmlo1 718238384Sjkim ldw -16($xfer),$abhi 719238384Sjkim addc %r0,$nmhi1,$hi1 720238384Sjkim 721238384Sjkim xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] 722238384Sjkim ldw 8($tp),$ti1 ; tp[j] 723238384Sjkim xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 724238384Sjkim ldw -4($xfer),$nmlo0 725238384Sjkim add $hi0,$ablo,$ablo 726238384Sjkim ldw -8($xfer),$nmhi0 727238384Sjkim addc %r0,$abhi,$abhi 728238384Sjkim stw $nmlo1,-4($tp) ; tp[j-1] 729238384Sjkim add $ti0,$ablo,$ablo 730238384Sjkim fstds ${fab0},-16($xfer) 731238384Sjkim addc %r0,$abhi,$hi0 732238384Sjkim fstds ${fnm0},-8($xfer) 733238384Sjkim add $ablo,$nmlo0,$nmlo0 734238384Sjkim ldw 4($xfer),$ablo 735238384Sjkim addc %r0,$nmhi0,$nmhi0 736238384Sjkim ldw 0($xfer),$abhi 737238384Sjkim add $hi1,$nmlo0,$nmlo0 738238384Sjkim stws,ma $nmlo0,8($tp) ; tp[j-1] 739238384Sjkim addib,<> 8,$idx,L\$inner_pa11 ; j++++ 740238384Sjkim addc %r0,$nmhi0,$hi1 741238384Sjkim 742238384Sjkim xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i] 743238384Sjkim ldw 12($xfer),$nmlo1 744238384Sjkim xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m 745238384Sjkim ldw 8($xfer),$nmhi1 746238384Sjkim add $hi0,$ablo,$ablo 747238384Sjkim ldw 4($tp),$ti0 ; tp[j] 748238384Sjkim addc %r0,$abhi,$abhi 749238384Sjkim fstds ${fab1},0($xfer) 750238384Sjkim add $ti1,$ablo,$ablo 751238384Sjkim fstds ${fnm1},8($xfer) 752238384Sjkim addc %r0,$abhi,$hi0 753238384Sjkim ldw -16($xfer),$abhi 754238384Sjkim add $ablo,$nmlo1,$nmlo1 755238384Sjkim ldw -12($xfer),$ablo 756238384Sjkim addc %r0,$nmhi1,$nmhi1 757238384Sjkim ldw -8($xfer),$nmhi0 758238384Sjkim add $hi1,$nmlo1,$nmlo1 759238384Sjkim ldw -4($xfer),$nmlo0 760238384Sjkim addc %r0,$nmhi1,$hi1 761238384Sjkim 762238384Sjkim add $hi0,$ablo,$ablo 763238384Sjkim stw $nmlo1,-4($tp) ; tp[j-1] 764238384Sjkim addc %r0,$abhi,$abhi 765238384Sjkim add $ti0,$ablo,$ablo 766238384Sjkim ldw 8($tp),$ti1 ; tp[j] 767238384Sjkim addc %r0,$abhi,$hi0 768238384Sjkim ldw 
0($xfer),$abhi 769238384Sjkim add $ablo,$nmlo0,$nmlo0 770238384Sjkim ldw 4($xfer),$ablo 771238384Sjkim addc %r0,$nmhi0,$nmhi0 772238384Sjkim ldws,mb 8($xfer),$nmhi1 773238384Sjkim add $hi1,$nmlo0,$nmlo0 774238384Sjkim ldw 4($xfer),$nmlo1 775238384Sjkim addc %r0,$nmhi0,$hi1 776238384Sjkim stws,ma $nmlo0,8($tp) ; tp[j-1] 777238384Sjkim 778238384Sjkim addib,= -1,$num,L\$outerdone_pa11; i-- 779238384Sjkim subi 0,$arrsz,$idx ; j=0 780238384Sjkim 781238384Sjkim fldws,ma 4($bp),${fbi} ; bp[i] 782238384Sjkim flddx $idx($ap),${fai} ; ap[0] 783238384Sjkim add $hi0,$ablo,$ablo 784238384Sjkim addc %r0,$abhi,$abhi 785238384Sjkim flddx $idx($np),${fni} ; np[0] 786238384Sjkim fldws 8($xfer),${fti}R ; tp[0] 787238384Sjkim add $ti1,$ablo,$ablo 788238384Sjkim addc %r0,$abhi,$hi0 789238384Sjkim 790238384Sjkim ldo 8($idx),$idx ; j++++ 791238384Sjkim xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i] 792238384Sjkim xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i] 793238384Sjkim ldw 4($tp),$ti0 ; tp[j] 794238384Sjkim 795238384Sjkim add $hi1,$nmlo1,$nmlo1 796238384Sjkim addc %r0,$nmhi1,$nmhi1 797238384Sjkim fstws,mb ${fab0}L,-8($xfer) ; save high part 798238384Sjkim add $ablo,$nmlo1,$nmlo1 799238384Sjkim addc %r0,$nmhi1,$hi1 800238384Sjkim fcpy,sgl %fr0,${fti}L ; zero high part 801238384Sjkim fcpy,sgl %fr0,${fab0}L 802238384Sjkim stw $nmlo1,-4($tp) ; tp[j-1] 803238384Sjkim 804238384Sjkim fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double 805238384Sjkim fcnvxf,dbl,dbl ${fab0},${fab0} 806238384Sjkim add $hi1,$hi0,$hi0 807238384Sjkim addc %r0,%r0,$hi1 808238384Sjkim fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] 809238384Sjkim add $ti0,$hi0,$hi0 810238384Sjkim addc %r0,$hi1,$hi1 811238384Sjkim fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int 812238384Sjkim stw $hi0,0($tp) 813238384Sjkim stw $hi1,4($tp) 814238384Sjkim xmpyu ${fn0},${fab0}R,${fm0} 815238384Sjkim 816238384Sjkim b L\$outer_pa11 817238384Sjkim ldo `$LOCALS+32+4`($fp),$tp 818238384Sjkim 819238384SjkimL\$outerdone_pa11 
820238384Sjkim add $hi0,$ablo,$ablo 821238384Sjkim addc %r0,$abhi,$abhi 822238384Sjkim add $ti1,$ablo,$ablo 823238384Sjkim addc %r0,$abhi,$hi0 824238384Sjkim 825238384Sjkim ldw 4($tp),$ti0 ; tp[j] 826238384Sjkim 827238384Sjkim add $hi1,$nmlo1,$nmlo1 828238384Sjkim addc %r0,$nmhi1,$nmhi1 829238384Sjkim add $ablo,$nmlo1,$nmlo1 830238384Sjkim addc %r0,$nmhi1,$hi1 831238384Sjkim stw $nmlo1,-4($tp) ; tp[j-1] 832238384Sjkim 833238384Sjkim add $hi1,$hi0,$hi0 834238384Sjkim addc %r0,%r0,$hi1 835238384Sjkim add $ti0,$hi0,$hi0 836238384Sjkim addc %r0,$hi1,$hi1 837238384Sjkim stw $hi0,0($tp) 838238384Sjkim stw $hi1,4($tp) 839238384Sjkim 840238384Sjkim ldo `$LOCALS+32+4`($fp),$tp 841238384Sjkim sub %r0,%r0,%r0 ; clear borrow 842238384Sjkim ldw -4($tp),$ti0 843238384Sjkim addl $tp,$arrsz,$tp 844238384SjkimL\$sub_pa11 845238384Sjkim ldwx $idx($np),$hi0 846238384Sjkim subb $ti0,$hi0,$hi1 847238384Sjkim ldwx $idx($tp),$ti0 848238384Sjkim addib,<> 4,$idx,L\$sub_pa11 849238384Sjkim stws,ma $hi1,4($rp) 850238384Sjkim 851238384Sjkim subb $ti0,%r0,$hi1 852238384Sjkim ldo -4($tp),$tp 853238384Sjkim and $tp,$hi1,$ap 854238384Sjkim andcm $rp,$hi1,$bp 855238384Sjkim or $ap,$bp,$np 856238384Sjkim 857238384Sjkim sub $rp,$arrsz,$rp ; rewind rp 858238384Sjkim subi 0,$arrsz,$idx 859238384Sjkim ldo `$LOCALS+32`($fp),$tp 860238384SjkimL\$copy_pa11 861238384Sjkim ldwx $idx($np),$hi0 862238384Sjkim stws,ma %r0,4($tp) 863238384Sjkim addib,<> 4,$idx,L\$copy_pa11 864238384Sjkim stws,ma $hi0,4($rp) 865238384Sjkim 866238384Sjkim nop ; alignment 867238384SjkimL\$done 868238384Sjkim___ 869238384Sjkim} 870238384Sjkim 871238384Sjkim$code.=<<___; 872238384Sjkim ldi 1,%r28 ; signal "handled" 873238384Sjkim ldo $FRAME($fp),%sp ; destroy tp[num+1] 874238384Sjkim 875238384Sjkim $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue 876238384Sjkim $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 877238384Sjkim $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 878238384Sjkim $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 879238384Sjkim $POP 
`-$FRAME+4*$SIZE_T`(%sp),%r7
	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
L\$abort
	bv	(%r2)
	.EXIT
	$POPMB	-$FRAME(%sp),%r3
	.PROCEND
	.STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
___

# Explicitly encode PA-RISC 2.0 instructions used in this module, so
# that it can be compiled with .LEVEL 1.0. It should be noted that I
# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
# directive...
#
# Every encoder below takes ($mod,$args): the completer text glued to
# the mnemonic (e.g. ",mb", possibly empty) and the operand string.
# When the operands match a form the encoder knows, it returns a
# "\t.WORD\t0x........\t; <original instruction>" line; otherwise it
# returns the instruction text unchanged (prefixed with a tab) and
# leaves it to the assembler.

# ldd: register-indexed form "%rX(%rB),%rT" (format 4) and
# short-displacement form "disp(%rB),%rT" (format 5).
my $ldd = sub {
    my ($mod,$args) = @_;
    my $orig = "ldd$mod\t$args";

    if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {       # format 4
        # Copy the captures at once: later matches would reset $1..$3.
        my ($index,$base,$target) = ($1,$2,$3);
        my $opcode = (0x03<<26)|($base<<21)|($index<<16)|(3<<6)|$target;
        return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {   # format 5
        my ($disp,$base,$target) = ($1,$2,$3);
        my $opcode = (0x03<<26)|($base<<21)|(1<<12)|(3<<6)|$target;
        # encode offset: low four bits go to bits 17..20, bit 4 of the
        # displacement goes to bit 16; anything above bit 4 is dropped,
        # so only a 5-bit displacement survives.
        $opcode |= (($disp&0xF)<<17)|(($disp&0x10)<<12);
        $opcode |= (1<<5)  if ($mod =~ /^,m/);      # any ",m*" completer
        $opcode |= (1<<13) if ($mod =~ /^,mb/);     # additionally for ",mb"
        return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }

    return "\t".$orig;
};

# std: short-displacement form "%rS,disp(%rB)" (format 6).
my $std = sub {
    my ($mod,$args) = @_;
    my $orig = "std$mod\t$args";

    if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) {      # format 6
        my ($source,$disp,$base) = ($1,$2,$3);
        my $opcode = (0x03<<26)|($base<<21)|($source<<16)|(1<<12)|(0xB<<6);
        # encode offset: same 5-bit split as in ldd, but placed in
        # bits 1..4 with bit 4 of the displacement in bit 0.
        $opcode |= (($disp&0xF)<<1)|(($disp&0x10)>>4);
        $opcode |= (1<<5)  if ($mod =~ /^,m/);      # any ",m*" completer
        $opcode |= (1<<13) if ($mod =~ /^,mb/);     # additionally for ",mb"
        return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }

    return "\t".$orig;
};

# extrd: fixed-position form "%rS,pos,len,%rT" (format 15) and
# %sar-relative form "%rS,%sar,len,%rT" (format 12).
my $extrd = sub {
    my ($mod,$args) = @_;
    my $orig = "extrd$mod\t$args";

    # I only have ",u" completer, it's implicitly encoded...
    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) {   # format 15
        my ($source,$pos,$length,$target) = ($1,$2,$3,$4);
        my $opcode = (0x36<<26)|($source<<21)|($target<<16);
        my $len = 32-$length;
        $opcode |= (($pos&0x20)<<6)|(($pos&0x1f)<<5);   # encode pos
        $opcode |= (($len&0x20)<<7)|($len&0x1f);        # encode len
        return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) {    # format 12
        my ($source,$length,$target) = ($1,$2,$3);
        my $opcode = (0x34<<26)|($source<<21)|($target<<16)|(2<<11)|(1<<9);
        my $len = 32-$length;
        $opcode |= (($len&0x20)<<3)|($len&0x1f);        # encode len
        # Bit 13 is set when the completer contains ",=" or ",*=".
        $opcode |= (1<<13) if ($mod =~ /,\**=/);
        return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }

    return "\t".$orig;
};

# shrpd: immediate shift-amount form "%rA,%rB,sa,%rT" (format 14).
my $shrpd = sub {
    my ($mod,$args) = @_;
    my $orig = "shrpd$mod\t$args";

    if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) { # format 14
        my ($first,$second,$shift,$target) = ($1,$2,$3,$4);
        my $opcode = (0x34<<26)|($second<<21)|($first<<16)|(1<<10)|$target;
        my $cpos = 63-$shift;           # the field holds 63 minus the shift
        $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
        return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }

    return "\t".$orig;
};

# sub: only the ",db" completer with three register operands is
# hand-encoded; every other "sub" is passed through untouched.
my $sub = sub {
    my ($mod,$args) = @_;
    my $orig = "sub$mod\t$args";

    if ($mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) {
        my ($first,$second,$target) = ($1,$2,$3);
        my $opcode = (0x02<<26)|($second<<21)|($first<<16)|$target;
        $opcode |= (1<<10);     # e1
        $opcode |= (1<<8);      # e2
        $opcode |= (1<<5);      # d
        return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }

    return "\t".$orig;
};

# Mnemonic -> encoder lookup used by assemble().  A mnemonic that is
# not listed here is emitted verbatim; register any new encoder here.
my %encoder_for = (
    ldd   => $ldd,
    std   => $std,
    extrd => $extrd,
    shrpd => $shrpd,
    sub   => $sub,
);

# assemble($mnemonic,$mod,$args)
#   Returns the output text for one instruction: the hand-encoded
#   .WORD line if an encoder is registered for $mnemonic (the encoder
#   itself may still decline and return the plain text), otherwise
#   "\t$mnemonic$mod\t$args".
sub assemble {
    my ($mnemonic,$mod,$args) = @_;
    my $encoder = $encoder_for{$mnemonic};

    return ref($encoder) eq 'CODE' ? $encoder->($mod,$args)
                                   : "\t$mnemonic$mod\t$args";
}

# Post-process the accumulated template line by line and print it.
foreach (split("\n",$code)) {
    # evaluate `...` arithmetic embedded in the template
    s/\`([^\`]*)\`/eval $1/ge;
    # flip word order in 64-bit mode...
    s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8);
    # assemble 2.0 instructions in 32-bit mode...
    s/^\s+([a-z]+)([\S]*)\s+([\S]*)/assemble($1,$2,$3)/e if ($BN_SZ==4);

    # rewrite "bv" as "bve" when $SIZE_T is 8
    s/\bbv\b/bve/gm if ($SIZE_T==8);

    print $_,"\n";
}
# Buffered write errors only surface here; fail loudly instead of
# leaving a silently truncated assembly file behind.
close STDOUT or die "error closing STDOUT: $!";