1238384Sjkim#!/usr/bin/env perl 2238384Sjkim 3238384Sjkim# ==================================================================== 4238384Sjkim# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL 5238384Sjkim# project. The module is, however, dual licensed under OpenSSL and 6238384Sjkim# CRYPTOGAMS licenses depending on where you obtain it. For further 7238384Sjkim# details see http://www.openssl.org/~appro/cryptogams/. 8238384Sjkim# ==================================================================== 9238384Sjkim 10238384Sjkim# December 2007 11238384Sjkim 12238384Sjkim# The reason for undertaken effort is basically following. Even though 13238384Sjkim# Power 6 CPU operates at incredible 4.7GHz clock frequency, its PKI 14238384Sjkim# performance was observed to be less than impressive, essentially as 15238384Sjkim# fast as 1.8GHz PPC970, or 2.6 times(!) slower than one would hope. 16238384Sjkim# Well, it's not surprising that IBM had to make some sacrifices to 17238384Sjkim# boost the clock frequency that much, but no overall improvement? 18238384Sjkim# Having observed how much difference did switching to FPU make on 19238384Sjkim# UltraSPARC, playing same stunt on Power 6 appeared appropriate... 20238384Sjkim# Unfortunately the resulting performance improvement is not as 21238384Sjkim# impressive, ~30%, and in absolute terms is still very far from what 22238384Sjkim# one would expect from 4.7GHz CPU. There is a chance that I'm doing 23238384Sjkim# something wrong, but in the lack of assembler level micro-profiling 24238384Sjkim# data or at least decent platform guide I can't tell... Or better 25238384Sjkim# results might be achieved with VMX... Anyway, this module provides 26238384Sjkim# *worse* performance on other PowerPC implementations, ~40-15% slower 27238384Sjkim# on PPC970 depending on key length and ~40% slower on Power 5 for all 28238384Sjkim# key lengths. As it's obviously inappropriate as "best all-round" 29238384Sjkim# alternative, it has to be complemented with run-time CPU family 30238384Sjkim# detection. Oh! It should also be noted that unlike other PowerPC 31238384Sjkim# implementation IALU ppc-mont.pl module performs *suboptimaly* on 32238384Sjkim# >=1024-bit key lengths on Power 6. It should also be noted that 33238384Sjkim# *everything* said so far applies to 64-bit builds! As far as 32-bit 34238384Sjkim# application executed on 64-bit CPU goes, this module is likely to 35238384Sjkim# become preferred choice, because it's easy to adapt it for such 36238384Sjkim# case and *is* faster than 32-bit ppc-mont.pl on *all* processors. 37238384Sjkim 38238384Sjkim# February 2008 39238384Sjkim 40238384Sjkim# Micro-profiling assisted optimization results in ~15% improvement 41238384Sjkim# over original ppc64-mont.pl version, or overall ~50% improvement 42238384Sjkim# over ppc.pl module on Power 6. If compared to ppc-mont.pl on same 43238384Sjkim# Power 6 CPU, this module is 5-150% faster depending on key length, 44238384Sjkim# [hereafter] more for longer keys. But if compared to ppc-mont.pl 45238384Sjkim# on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive 46238384Sjkim# in absolute terms, but it's apparently the way Power 6 is... 47238384Sjkim 48238384Sjkim# December 2009 49238384Sjkim 50238384Sjkim# Adapted for 32-bit build this module delivers 25-120%, yes, more 51238384Sjkim# than *twice* for longer keys, performance improvement over 32-bit 52238384Sjkim# ppc-mont.pl on 1.8GHz PPC970. However! This implementation utilizes 53238384Sjkim# even 64-bit integer operations and the trouble is that most PPC 54238384Sjkim# operating systems don't preserve upper halves of general purpose 55238384Sjkim# registers upon 32-bit signal delivery. They do preserve them upon 56238384Sjkim# context switch, but not signalling:-( This means that asynchronous 57238384Sjkim# signals have to be blocked upon entry to this subroutine. Signal 58238384Sjkim# masking (and of course complementary unmasking) has quite an impact 59238384Sjkim# on performance, naturally larger for shorter keys. It's so severe 60238384Sjkim# that 512-bit key performance can be as low as 1/3 of expected one. 61238384Sjkim# This is why this routine can be engaged for longer key operations 62238384Sjkim# only on these OSes, see crypto/ppccap.c for further details. MacOS X 63238384Sjkim# is an exception from this and doesn't require signal masking, and 64238384Sjkim# that's where above improvement coefficients were collected. For 65238384Sjkim# others alternative would be to break dependence on upper halves of 66238384Sjkim# GPRs by sticking to 32-bit integer operations... 67238384Sjkim 68238384Sjkim$flavour = shift; 69238384Sjkim 70238384Sjkimif ($flavour =~ /32/) { 71238384Sjkim $SIZE_T=4; 72238384Sjkim $RZONE= 224; 73238384Sjkim $fname= "bn_mul_mont_fpu64"; 74238384Sjkim 75238384Sjkim $STUX= "stwux"; # store indexed and update 76238384Sjkim $PUSH= "stw"; 77238384Sjkim $POP= "lwz"; 78238384Sjkim} elsif ($flavour =~ /64/) { 79238384Sjkim $SIZE_T=8; 80238384Sjkim $RZONE= 288; 81238384Sjkim $fname= "bn_mul_mont_fpu64"; 82238384Sjkim 83238384Sjkim # same as above, but 64-bit mnemonics... 84238384Sjkim $STUX= "stdux"; # store indexed and update 85238384Sjkim $PUSH= "std"; 86238384Sjkim $POP= "ld"; 87238384Sjkim} else { die "nonsense $flavour"; } 88238384Sjkim 89238384Sjkim$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 90238384Sjkim( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or 91238384Sjkim( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or 92238384Sjkimdie "can't locate ppc-xlate.pl"; 93238384Sjkim 94238384Sjkimopen STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; 95238384Sjkim 96238384Sjkim$FRAME=64; # padded frame header 97238384Sjkim$TRANSFER=16*8; 98238384Sjkim 99238384Sjkim$carry="r0"; 100238384Sjkim$sp="r1"; 101238384Sjkim$toc="r2"; 102238384Sjkim$rp="r3"; $ovf="r3"; 103238384Sjkim$ap="r4"; 104238384Sjkim$bp="r5"; 105238384Sjkim$np="r6"; 106238384Sjkim$n0="r7"; 107238384Sjkim$num="r8"; 108238384Sjkim$rp="r9"; # $rp is reassigned 109238384Sjkim$tp="r10"; 110238384Sjkim$j="r11"; 111238384Sjkim$i="r12"; 112238384Sjkim# non-volatile registers 113238384Sjkim$nap_d="r22"; # interleaved ap and np in double format 114238384Sjkim$a0="r23"; # ap[0] 115238384Sjkim$t0="r24"; # temporary registers 116238384Sjkim$t1="r25"; 117238384Sjkim$t2="r26"; 118238384Sjkim$t3="r27"; 119238384Sjkim$t4="r28"; 120238384Sjkim$t5="r29"; 121238384Sjkim$t6="r30"; 122238384Sjkim$t7="r31"; 123238384Sjkim 124238384Sjkim# PPC offers enough register bank capacity to unroll inner loops twice 125238384Sjkim# 126238384Sjkim# ..A3A2A1A0 127238384Sjkim# dcba 128238384Sjkim# ----------- 129238384Sjkim# A0a 130238384Sjkim# A0b 131238384Sjkim# A0c 132238384Sjkim# A0d 133238384Sjkim# A1a 134238384Sjkim# A1b 135238384Sjkim# A1c 136238384Sjkim# A1d 137238384Sjkim# A2a 138238384Sjkim# A2b 139238384Sjkim# A2c 140238384Sjkim# A2d 141238384Sjkim# A3a 142238384Sjkim# A3b 143238384Sjkim# A3c 144238384Sjkim# A3d 145238384Sjkim# ..a 146238384Sjkim# ..b 147238384Sjkim# 148238384Sjkim$ba="f0"; $bb="f1"; $bc="f2"; $bd="f3"; 149238384Sjkim$na="f4"; $nb="f5"; $nc="f6"; $nd="f7"; 150238384Sjkim$dota="f8"; $dotb="f9"; 151238384Sjkim$A0="f10"; $A1="f11"; $A2="f12"; $A3="f13"; 152238384Sjkim$N0="f20"; $N1="f21"; $N2="f22"; $N3="f23"; 153238384Sjkim$T0a="f24"; $T0b="f25"; 154238384Sjkim$T1a="f26"; $T1b="f27"; 155238384Sjkim$T2a="f28"; $T2b="f29"; 156238384Sjkim$T3a="f30"; $T3b="f31"; 157238384Sjkim 158238384Sjkim# sp----------->+-------------------------------+ 159238384Sjkim# | saved sp | 160238384Sjkim# +-------------------------------+ 161238384Sjkim# . . 162238384Sjkim# +64 +-------------------------------+ 163238384Sjkim# | 16 gpr<->fpr transfer zone | 164238384Sjkim# . . 165238384Sjkim# . . 166238384Sjkim# +16*8 +-------------------------------+ 167238384Sjkim# | __int64 tmp[-1] | 168238384Sjkim# +-------------------------------+ 169238384Sjkim# | __int64 tmp[num] | 170238384Sjkim# . . 171238384Sjkim# . . 172238384Sjkim# . . 173238384Sjkim# +(num+1)*8 +-------------------------------+ 174238384Sjkim# | padding to 64 byte boundary | 175238384Sjkim# . . 176238384Sjkim# +X +-------------------------------+ 177238384Sjkim# | double nap_d[4*num] | 178238384Sjkim# . . 179238384Sjkim# . . 180238384Sjkim# . . 181238384Sjkim# +-------------------------------+ 182238384Sjkim# . . 183238384Sjkim# -12*size_t +-------------------------------+ 184238384Sjkim# | 10 saved gpr, r22-r31 | 185238384Sjkim# . . 186238384Sjkim# . . 187238384Sjkim# -12*8 +-------------------------------+ 188238384Sjkim# | 12 saved fpr, f20-f31 | 189238384Sjkim# . . 190238384Sjkim# . . 191238384Sjkim# +-------------------------------+ 192238384Sjkim 193238384Sjkim$code=<<___; 194238384Sjkim.machine "any" 195238384Sjkim.text 196238384Sjkim 197238384Sjkim.globl .$fname 198238384Sjkim.align 5 199238384Sjkim.$fname: 200238384Sjkim cmpwi $num,`3*8/$SIZE_T` 201238384Sjkim mr $rp,r3 ; $rp is reassigned 202238384Sjkim li r3,0 ; possible "not handled" return code 203238384Sjkim bltlr- 204238384Sjkim andi. r0,$num,`16/$SIZE_T-1` ; $num has to be "even" 205238384Sjkim bnelr- 206238384Sjkim 207238384Sjkim slwi $num,$num,`log($SIZE_T)/log(2)` ; num*=sizeof(BN_LONG) 208238384Sjkim li $i,-4096 209238384Sjkim slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num 210238384Sjkim add $tp,$tp,$num ; place for tp[num+1] 211238384Sjkim addi $tp,$tp,`$FRAME+$TRANSFER+8+64+$RZONE` 212238384Sjkim subf $tp,$tp,$sp ; $sp-$tp 213238384Sjkim and $tp,$tp,$i ; minimize TLB usage 214238384Sjkim subf $tp,$sp,$tp ; $tp-$sp 215238384Sjkim mr $i,$sp 216238384Sjkim $STUX $sp,$sp,$tp ; alloca 217238384Sjkim 218238384Sjkim $PUSH r22,`-12*8-10*$SIZE_T`($i) 219238384Sjkim $PUSH r23,`-12*8-9*$SIZE_T`($i) 220238384Sjkim $PUSH r24,`-12*8-8*$SIZE_T`($i) 221238384Sjkim $PUSH r25,`-12*8-7*$SIZE_T`($i) 222238384Sjkim $PUSH r26,`-12*8-6*$SIZE_T`($i) 223238384Sjkim $PUSH r27,`-12*8-5*$SIZE_T`($i) 224238384Sjkim $PUSH r28,`-12*8-4*$SIZE_T`($i) 225238384Sjkim $PUSH r29,`-12*8-3*$SIZE_T`($i) 226238384Sjkim $PUSH r30,`-12*8-2*$SIZE_T`($i) 227238384Sjkim $PUSH r31,`-12*8-1*$SIZE_T`($i) 228238384Sjkim stfd f20,`-12*8`($i) 229238384Sjkim stfd f21,`-11*8`($i) 230238384Sjkim stfd f22,`-10*8`($i) 231238384Sjkim stfd f23,`-9*8`($i) 232238384Sjkim stfd f24,`-8*8`($i) 233238384Sjkim stfd f25,`-7*8`($i) 234238384Sjkim stfd f26,`-6*8`($i) 235238384Sjkim stfd f27,`-5*8`($i) 236238384Sjkim stfd f28,`-4*8`($i) 237238384Sjkim stfd f29,`-3*8`($i) 238238384Sjkim stfd f30,`-2*8`($i) 239238384Sjkim stfd f31,`-1*8`($i) 240238384Sjkim___ 241238384Sjkim$code.=<<___ if ($SIZE_T==8); 242238384Sjkim ld $a0,0($ap) ; pull ap[0] value 243238384Sjkim ld $n0,0($n0) ; pull n0[0] value 244238384Sjkim ld $t3,0($bp) ; bp[0] 245238384Sjkim___ 246238384Sjkim$code.=<<___ if ($SIZE_T==4); 247238384Sjkim mr $t1,$n0 248238384Sjkim lwz $a0,0($ap) ; pull ap[0,1] value 249238384Sjkim lwz $t0,4($ap) 250238384Sjkim lwz $n0,0($t1) ; pull n0[0,1] value 251238384Sjkim lwz $t1,4($t1) 252238384Sjkim lwz $t3,0($bp) ; bp[0,1] 253238384Sjkim lwz $t2,4($bp) 254238384Sjkim insrdi $a0,$t0,32,0 255238384Sjkim insrdi $n0,$t1,32,0 256238384Sjkim insrdi $t3,$t2,32,0 257238384Sjkim___ 258238384Sjkim$code.=<<___; 259238384Sjkim addi $tp,$sp,`$FRAME+$TRANSFER+8+64` 260238384Sjkim li $i,-64 261238384Sjkim add $nap_d,$tp,$num 262238384Sjkim and $nap_d,$nap_d,$i ; align to 64 bytes 263238384Sjkim 264238384Sjkim mulld $t7,$a0,$t3 ; ap[0]*bp[0] 265238384Sjkim ; nap_d is off by 1, because it's used with stfdu/lfdu 266238384Sjkim addi $nap_d,$nap_d,-8 267238384Sjkim srwi $j,$num,`3+1` ; counter register, num/2 268238384Sjkim mulld $t7,$t7,$n0 ; tp[0]*n0 269238384Sjkim addi $j,$j,-1 270238384Sjkim addi $tp,$sp,`$FRAME+$TRANSFER-8` 271238384Sjkim li $carry,0 272238384Sjkim mtctr $j 273238384Sjkim 274238384Sjkim ; transfer bp[0] to FPU as 4x16-bit values 275238384Sjkim extrdi $t0,$t3,16,48 276238384Sjkim extrdi $t1,$t3,16,32 277238384Sjkim extrdi $t2,$t3,16,16 278238384Sjkim extrdi $t3,$t3,16,0 279238384Sjkim std $t0,`$FRAME+0`($sp) 280238384Sjkim std $t1,`$FRAME+8`($sp) 281238384Sjkim std $t2,`$FRAME+16`($sp) 282238384Sjkim std $t3,`$FRAME+24`($sp) 283238384Sjkim ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values 284238384Sjkim extrdi $t4,$t7,16,48 285238384Sjkim extrdi $t5,$t7,16,32 286238384Sjkim extrdi $t6,$t7,16,16 287238384Sjkim extrdi $t7,$t7,16,0 288238384Sjkim std $t4,`$FRAME+32`($sp) 289238384Sjkim std $t5,`$FRAME+40`($sp) 290238384Sjkim std $t6,`$FRAME+48`($sp) 291238384Sjkim std $t7,`$FRAME+56`($sp) 292238384Sjkim___ 293238384Sjkim$code.=<<___ if ($SIZE_T==8); 294238384Sjkim lwz $t0,4($ap) ; load a[j] as 32-bit word pair 295238384Sjkim lwz $t1,0($ap) 296238384Sjkim lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair 297238384Sjkim lwz $t3,8($ap) 298238384Sjkim lwz $t4,4($np) ; load n[j] as 32-bit word pair 299238384Sjkim lwz $t5,0($np) 300238384Sjkim lwz $t6,12($np) ; load n[j+1] as 32-bit word pair 301238384Sjkim lwz $t7,8($np) 302238384Sjkim___ 303238384Sjkim$code.=<<___ if ($SIZE_T==4); 304238384Sjkim lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs 305238384Sjkim lwz $t1,4($ap) 306238384Sjkim lwz $t2,8($ap) 307238384Sjkim lwz $t3,12($ap) 308238384Sjkim lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs 309238384Sjkim lwz $t5,4($np) 310238384Sjkim lwz $t6,8($np) 311238384Sjkim lwz $t7,12($np) 312238384Sjkim___ 313238384Sjkim$code.=<<___; 314238384Sjkim lfd $ba,`$FRAME+0`($sp) 315238384Sjkim lfd $bb,`$FRAME+8`($sp) 316238384Sjkim lfd $bc,`$FRAME+16`($sp) 317238384Sjkim lfd $bd,`$FRAME+24`($sp) 318238384Sjkim lfd $na,`$FRAME+32`($sp) 319238384Sjkim lfd $nb,`$FRAME+40`($sp) 320238384Sjkim lfd $nc,`$FRAME+48`($sp) 321238384Sjkim lfd $nd,`$FRAME+56`($sp) 322238384Sjkim std $t0,`$FRAME+64`($sp) 323238384Sjkim std $t1,`$FRAME+72`($sp) 324238384Sjkim std $t2,`$FRAME+80`($sp) 325238384Sjkim std $t3,`$FRAME+88`($sp) 326238384Sjkim std $t4,`$FRAME+96`($sp) 327238384Sjkim std $t5,`$FRAME+104`($sp) 328238384Sjkim std $t6,`$FRAME+112`($sp) 329238384Sjkim std $t7,`$FRAME+120`($sp) 330238384Sjkim fcfid $ba,$ba 331238384Sjkim fcfid $bb,$bb 332238384Sjkim fcfid $bc,$bc 333238384Sjkim fcfid $bd,$bd 334238384Sjkim fcfid $na,$na 335238384Sjkim fcfid $nb,$nb 336238384Sjkim fcfid $nc,$nc 337238384Sjkim fcfid $nd,$nd 338238384Sjkim 339238384Sjkim lfd $A0,`$FRAME+64`($sp) 340238384Sjkim lfd $A1,`$FRAME+72`($sp) 341238384Sjkim lfd $A2,`$FRAME+80`($sp) 342238384Sjkim lfd $A3,`$FRAME+88`($sp) 343238384Sjkim lfd $N0,`$FRAME+96`($sp) 344238384Sjkim lfd $N1,`$FRAME+104`($sp) 345238384Sjkim lfd $N2,`$FRAME+112`($sp) 346238384Sjkim lfd $N3,`$FRAME+120`($sp) 347238384Sjkim fcfid $A0,$A0 348238384Sjkim fcfid $A1,$A1 349238384Sjkim fcfid $A2,$A2 350238384Sjkim fcfid $A3,$A3 351238384Sjkim fcfid $N0,$N0 352238384Sjkim fcfid $N1,$N1 353238384Sjkim fcfid $N2,$N2 354238384Sjkim fcfid $N3,$N3 355238384Sjkim addi $ap,$ap,16 356238384Sjkim addi $np,$np,16 357238384Sjkim 358238384Sjkim fmul $T1a,$A1,$ba 359238384Sjkim fmul $T1b,$A1,$bb 360238384Sjkim stfd $A0,8($nap_d) ; save a[j] in double format 361238384Sjkim stfd $A1,16($nap_d) 362238384Sjkim fmul $T2a,$A2,$ba 363238384Sjkim fmul $T2b,$A2,$bb 364238384Sjkim stfd $A2,24($nap_d) ; save a[j+1] in double format 365238384Sjkim stfd $A3,32($nap_d) 366238384Sjkim fmul $T3a,$A3,$ba 367238384Sjkim fmul $T3b,$A3,$bb 368238384Sjkim stfd $N0,40($nap_d) ; save n[j] in double format 369238384Sjkim stfd $N1,48($nap_d) 370238384Sjkim fmul $T0a,$A0,$ba 371238384Sjkim fmul $T0b,$A0,$bb 372238384Sjkim stfd $N2,56($nap_d) ; save n[j+1] in double format 373238384Sjkim stfdu $N3,64($nap_d) 374238384Sjkim 375238384Sjkim fmadd $T1a,$A0,$bc,$T1a 376238384Sjkim fmadd $T1b,$A0,$bd,$T1b 377238384Sjkim fmadd $T2a,$A1,$bc,$T2a 378238384Sjkim fmadd $T2b,$A1,$bd,$T2b 379238384Sjkim fmadd $T3a,$A2,$bc,$T3a 380238384Sjkim fmadd $T3b,$A2,$bd,$T3b 381238384Sjkim fmul $dota,$A3,$bc 382238384Sjkim fmul $dotb,$A3,$bd 383238384Sjkim 384238384Sjkim fmadd $T1a,$N1,$na,$T1a 385238384Sjkim fmadd $T1b,$N1,$nb,$T1b 386238384Sjkim fmadd $T2a,$N2,$na,$T2a 387238384Sjkim fmadd $T2b,$N2,$nb,$T2b 388238384Sjkim fmadd $T3a,$N3,$na,$T3a 389238384Sjkim fmadd $T3b,$N3,$nb,$T3b 390238384Sjkim fmadd $T0a,$N0,$na,$T0a 391238384Sjkim fmadd $T0b,$N0,$nb,$T0b 392238384Sjkim 393238384Sjkim fmadd $T1a,$N0,$nc,$T1a 394238384Sjkim fmadd $T1b,$N0,$nd,$T1b 395238384Sjkim fmadd $T2a,$N1,$nc,$T2a 396238384Sjkim fmadd $T2b,$N1,$nd,$T2b 397238384Sjkim fmadd $T3a,$N2,$nc,$T3a 398238384Sjkim fmadd $T3b,$N2,$nd,$T3b 399238384Sjkim fmadd $dota,$N3,$nc,$dota 400238384Sjkim fmadd $dotb,$N3,$nd,$dotb 401238384Sjkim 402238384Sjkim fctid $T0a,$T0a 403238384Sjkim fctid $T0b,$T0b 404238384Sjkim fctid $T1a,$T1a 405238384Sjkim fctid $T1b,$T1b 406238384Sjkim fctid $T2a,$T2a 407238384Sjkim fctid $T2b,$T2b 408238384Sjkim fctid $T3a,$T3a 409238384Sjkim fctid $T3b,$T3b 410238384Sjkim 411238384Sjkim stfd $T0a,`$FRAME+0`($sp) 412238384Sjkim stfd $T0b,`$FRAME+8`($sp) 413238384Sjkim stfd $T1a,`$FRAME+16`($sp) 414238384Sjkim stfd $T1b,`$FRAME+24`($sp) 415238384Sjkim stfd $T2a,`$FRAME+32`($sp) 416238384Sjkim stfd $T2b,`$FRAME+40`($sp) 417238384Sjkim stfd $T3a,`$FRAME+48`($sp) 418238384Sjkim stfd $T3b,`$FRAME+56`($sp) 419238384Sjkim 420238384Sjkim.align 5 421238384SjkimL1st: 422238384Sjkim___ 423238384Sjkim$code.=<<___ if ($SIZE_T==8); 424238384Sjkim lwz $t0,4($ap) ; load a[j] as 32-bit word pair 425238384Sjkim lwz $t1,0($ap) 426238384Sjkim lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair 427238384Sjkim lwz $t3,8($ap) 428238384Sjkim lwz $t4,4($np) ; load n[j] as 32-bit word pair 429238384Sjkim lwz $t5,0($np) 430238384Sjkim lwz $t6,12($np) ; load n[j+1] as 32-bit word pair 431238384Sjkim lwz $t7,8($np) 432238384Sjkim___ 433238384Sjkim$code.=<<___ if ($SIZE_T==4); 434238384Sjkim lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs 435238384Sjkim lwz $t1,4($ap) 436238384Sjkim lwz $t2,8($ap) 437238384Sjkim lwz $t3,12($ap) 438238384Sjkim lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs 439238384Sjkim lwz $t5,4($np) 440238384Sjkim lwz $t6,8($np) 441238384Sjkim lwz $t7,12($np) 442238384Sjkim___ 443238384Sjkim$code.=<<___; 444238384Sjkim std $t0,`$FRAME+64`($sp) 445238384Sjkim std $t1,`$FRAME+72`($sp) 446238384Sjkim std $t2,`$FRAME+80`($sp) 447238384Sjkim std $t3,`$FRAME+88`($sp) 448238384Sjkim std $t4,`$FRAME+96`($sp) 449238384Sjkim std $t5,`$FRAME+104`($sp) 450238384Sjkim std $t6,`$FRAME+112`($sp) 451238384Sjkim std $t7,`$FRAME+120`($sp) 452238384Sjkim ld $t0,`$FRAME+0`($sp) 453238384Sjkim ld $t1,`$FRAME+8`($sp) 454238384Sjkim ld $t2,`$FRAME+16`($sp) 455238384Sjkim ld $t3,`$FRAME+24`($sp) 456238384Sjkim ld $t4,`$FRAME+32`($sp) 457238384Sjkim ld $t5,`$FRAME+40`($sp) 458238384Sjkim ld $t6,`$FRAME+48`($sp) 459238384Sjkim ld $t7,`$FRAME+56`($sp) 460238384Sjkim lfd $A0,`$FRAME+64`($sp) 461238384Sjkim lfd $A1,`$FRAME+72`($sp) 462238384Sjkim lfd $A2,`$FRAME+80`($sp) 463238384Sjkim lfd $A3,`$FRAME+88`($sp) 464238384Sjkim lfd $N0,`$FRAME+96`($sp) 465238384Sjkim lfd $N1,`$FRAME+104`($sp) 466238384Sjkim lfd $N2,`$FRAME+112`($sp) 467238384Sjkim lfd $N3,`$FRAME+120`($sp) 468238384Sjkim fcfid $A0,$A0 469238384Sjkim fcfid $A1,$A1 470238384Sjkim fcfid $A2,$A2 471238384Sjkim fcfid $A3,$A3 472238384Sjkim fcfid $N0,$N0 473238384Sjkim fcfid $N1,$N1 474238384Sjkim fcfid $N2,$N2 475238384Sjkim fcfid $N3,$N3 476238384Sjkim addi $ap,$ap,16 477238384Sjkim addi $np,$np,16 478238384Sjkim 479238384Sjkim fmul $T1a,$A1,$ba 480238384Sjkim fmul $T1b,$A1,$bb 481238384Sjkim fmul $T2a,$A2,$ba 482238384Sjkim fmul $T2b,$A2,$bb 483238384Sjkim stfd $A0,8($nap_d) ; save a[j] in double format 484238384Sjkim stfd $A1,16($nap_d) 485238384Sjkim fmul $T3a,$A3,$ba 486238384Sjkim fmul $T3b,$A3,$bb 487238384Sjkim fmadd $T0a,$A0,$ba,$dota 488238384Sjkim fmadd $T0b,$A0,$bb,$dotb 489238384Sjkim stfd $A2,24($nap_d) ; save a[j+1] in double format 490238384Sjkim stfd $A3,32($nap_d) 491238384Sjkim 492238384Sjkim fmadd $T1a,$A0,$bc,$T1a 493238384Sjkim fmadd $T1b,$A0,$bd,$T1b 494238384Sjkim fmadd $T2a,$A1,$bc,$T2a 495238384Sjkim fmadd $T2b,$A1,$bd,$T2b 496238384Sjkim stfd $N0,40($nap_d) ; save n[j] in double format 497238384Sjkim stfd $N1,48($nap_d) 498238384Sjkim fmadd $T3a,$A2,$bc,$T3a 499238384Sjkim fmadd $T3b,$A2,$bd,$T3b 500238384Sjkim add $t0,$t0,$carry ; can not overflow 501238384Sjkim fmul $dota,$A3,$bc 502238384Sjkim fmul $dotb,$A3,$bd 503238384Sjkim stfd $N2,56($nap_d) ; save n[j+1] in double format 504238384Sjkim stfdu $N3,64($nap_d) 505238384Sjkim srdi $carry,$t0,16 506238384Sjkim add $t1,$t1,$carry 507238384Sjkim srdi $carry,$t1,16 508238384Sjkim 509238384Sjkim fmadd $T1a,$N1,$na,$T1a 510238384Sjkim fmadd $T1b,$N1,$nb,$T1b 511238384Sjkim insrdi $t0,$t1,16,32 512238384Sjkim fmadd $T2a,$N2,$na,$T2a 513238384Sjkim fmadd $T2b,$N2,$nb,$T2b 514238384Sjkim add $t2,$t2,$carry 515238384Sjkim fmadd $T3a,$N3,$na,$T3a 516238384Sjkim fmadd $T3b,$N3,$nb,$T3b 517238384Sjkim srdi $carry,$t2,16 518238384Sjkim fmadd $T0a,$N0,$na,$T0a 519238384Sjkim fmadd $T0b,$N0,$nb,$T0b 520238384Sjkim insrdi $t0,$t2,16,16 521238384Sjkim add $t3,$t3,$carry 522238384Sjkim srdi $carry,$t3,16 523238384Sjkim 524238384Sjkim fmadd $T1a,$N0,$nc,$T1a 525238384Sjkim fmadd $T1b,$N0,$nd,$T1b 526238384Sjkim insrdi $t0,$t3,16,0 ; 0..63 bits 527238384Sjkim fmadd $T2a,$N1,$nc,$T2a 528238384Sjkim fmadd $T2b,$N1,$nd,$T2b 529238384Sjkim add $t4,$t4,$carry 530238384Sjkim fmadd $T3a,$N2,$nc,$T3a 531238384Sjkim fmadd $T3b,$N2,$nd,$T3b 532238384Sjkim srdi $carry,$t4,16 533238384Sjkim fmadd $dota,$N3,$nc,$dota 534238384Sjkim fmadd $dotb,$N3,$nd,$dotb 535238384Sjkim add $t5,$t5,$carry 536238384Sjkim srdi $carry,$t5,16 537238384Sjkim insrdi $t4,$t5,16,32 538238384Sjkim 539238384Sjkim fctid $T0a,$T0a 540238384Sjkim fctid $T0b,$T0b 541238384Sjkim add $t6,$t6,$carry 542238384Sjkim fctid $T1a,$T1a 543238384Sjkim fctid $T1b,$T1b 544238384Sjkim srdi $carry,$t6,16 545238384Sjkim fctid $T2a,$T2a 546238384Sjkim fctid $T2b,$T2b 547238384Sjkim insrdi $t4,$t6,16,16 548238384Sjkim fctid $T3a,$T3a 549238384Sjkim fctid $T3b,$T3b 550238384Sjkim add $t7,$t7,$carry 551238384Sjkim insrdi $t4,$t7,16,0 ; 64..127 bits 552238384Sjkim srdi $carry,$t7,16 ; upper 33 bits 553238384Sjkim 554238384Sjkim stfd $T0a,`$FRAME+0`($sp) 555238384Sjkim stfd $T0b,`$FRAME+8`($sp) 556238384Sjkim stfd $T1a,`$FRAME+16`($sp) 557238384Sjkim stfd $T1b,`$FRAME+24`($sp) 558238384Sjkim stfd $T2a,`$FRAME+32`($sp) 559238384Sjkim stfd $T2b,`$FRAME+40`($sp) 560238384Sjkim stfd $T3a,`$FRAME+48`($sp) 561238384Sjkim stfd $T3b,`$FRAME+56`($sp) 562238384Sjkim std $t0,8($tp) ; tp[j-1] 563238384Sjkim stdu $t4,16($tp) ; tp[j] 564238384Sjkim bdnz- L1st 565238384Sjkim 566238384Sjkim fctid $dota,$dota 567238384Sjkim fctid $dotb,$dotb 568238384Sjkim 569238384Sjkim ld $t0,`$FRAME+0`($sp) 570238384Sjkim ld $t1,`$FRAME+8`($sp) 571238384Sjkim ld $t2,`$FRAME+16`($sp) 572238384Sjkim ld $t3,`$FRAME+24`($sp) 573238384Sjkim ld $t4,`$FRAME+32`($sp) 574238384Sjkim ld $t5,`$FRAME+40`($sp) 575238384Sjkim ld $t6,`$FRAME+48`($sp) 576238384Sjkim ld $t7,`$FRAME+56`($sp) 577238384Sjkim stfd $dota,`$FRAME+64`($sp) 578238384Sjkim stfd $dotb,`$FRAME+72`($sp) 579238384Sjkim 580238384Sjkim add $t0,$t0,$carry ; can not overflow 581238384Sjkim srdi $carry,$t0,16 582238384Sjkim add $t1,$t1,$carry 583238384Sjkim srdi $carry,$t1,16 584238384Sjkim insrdi $t0,$t1,16,32 585238384Sjkim add $t2,$t2,$carry 586238384Sjkim srdi $carry,$t2,16 587238384Sjkim insrdi $t0,$t2,16,16 588238384Sjkim add $t3,$t3,$carry 589238384Sjkim srdi $carry,$t3,16 590238384Sjkim insrdi $t0,$t3,16,0 ; 0..63 bits 591238384Sjkim add $t4,$t4,$carry 592238384Sjkim srdi $carry,$t4,16 593238384Sjkim add $t5,$t5,$carry 594238384Sjkim srdi $carry,$t5,16 595238384Sjkim insrdi $t4,$t5,16,32 596238384Sjkim add $t6,$t6,$carry 597238384Sjkim srdi $carry,$t6,16 598238384Sjkim insrdi $t4,$t6,16,16 599238384Sjkim add $t7,$t7,$carry 600238384Sjkim insrdi $t4,$t7,16,0 ; 64..127 bits 601238384Sjkim srdi $carry,$t7,16 ; upper 33 bits 602238384Sjkim ld $t6,`$FRAME+64`($sp) 603238384Sjkim ld $t7,`$FRAME+72`($sp) 604238384Sjkim 605238384Sjkim std $t0,8($tp) ; tp[j-1] 606238384Sjkim stdu $t4,16($tp) ; tp[j] 607238384Sjkim 608238384Sjkim add $t6,$t6,$carry ; can not overflow 609238384Sjkim srdi $carry,$t6,16 610238384Sjkim add $t7,$t7,$carry 611238384Sjkim insrdi $t6,$t7,48,0 612238384Sjkim srdi $ovf,$t7,48 613238384Sjkim std $t6,8($tp) ; tp[num-1] 614238384Sjkim 615238384Sjkim slwi $t7,$num,2 616238384Sjkim subf $nap_d,$t7,$nap_d ; rewind pointer 617238384Sjkim 618238384Sjkim li $i,8 ; i=1 619238384Sjkim.align 5 620238384SjkimLouter: 621238384Sjkim___ 622238384Sjkim$code.=<<___ if ($SIZE_T==8); 623238384Sjkim ldx $t3,$bp,$i ; bp[i] 624238384Sjkim___ 625238384Sjkim$code.=<<___ if ($SIZE_T==4); 626238384Sjkim add $t0,$bp,$i 627238384Sjkim lwz $t3,0($t0) ; bp[i,i+1] 628238384Sjkim lwz $t0,4($t0) 629238384Sjkim insrdi $t3,$t0,32,0 630238384Sjkim___ 631238384Sjkim$code.=<<___; 632238384Sjkim ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0] 633238384Sjkim mulld $t7,$a0,$t3 ; ap[0]*bp[i] 634238384Sjkim 635238384Sjkim addi $tp,$sp,`$FRAME+$TRANSFER` 636238384Sjkim add $t7,$t7,$t6 ; ap[0]*bp[i]+tp[0] 637238384Sjkim li $carry,0 638238384Sjkim mulld $t7,$t7,$n0 ; tp[0]*n0 639238384Sjkim mtctr $j 640238384Sjkim 641238384Sjkim ; transfer bp[i] to FPU as 4x16-bit values 642238384Sjkim extrdi $t0,$t3,16,48 643238384Sjkim extrdi $t1,$t3,16,32 644238384Sjkim extrdi $t2,$t3,16,16 645238384Sjkim extrdi $t3,$t3,16,0 646238384Sjkim std $t0,`$FRAME+0`($sp) 647238384Sjkim std $t1,`$FRAME+8`($sp) 648238384Sjkim std $t2,`$FRAME+16`($sp) 649238384Sjkim std $t3,`$FRAME+24`($sp) 650238384Sjkim ; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values 651238384Sjkim extrdi $t4,$t7,16,48 652238384Sjkim extrdi $t5,$t7,16,32 653238384Sjkim extrdi $t6,$t7,16,16 654238384Sjkim extrdi $t7,$t7,16,0 655238384Sjkim std $t4,`$FRAME+32`($sp) 656238384Sjkim std $t5,`$FRAME+40`($sp) 657238384Sjkim std $t6,`$FRAME+48`($sp) 658238384Sjkim std $t7,`$FRAME+56`($sp) 659238384Sjkim 660238384Sjkim lfd $A0,8($nap_d) ; load a[j] in double format 661238384Sjkim lfd $A1,16($nap_d) 662238384Sjkim lfd $A2,24($nap_d) ; load a[j+1] in double format 663238384Sjkim lfd $A3,32($nap_d) 664238384Sjkim lfd $N0,40($nap_d) ; load n[j] in double format 665238384Sjkim lfd $N1,48($nap_d) 666238384Sjkim lfd $N2,56($nap_d) ; load n[j+1] in double format 667238384Sjkim lfdu $N3,64($nap_d) 668238384Sjkim 669238384Sjkim lfd $ba,`$FRAME+0`($sp) 670238384Sjkim lfd $bb,`$FRAME+8`($sp) 671238384Sjkim lfd $bc,`$FRAME+16`($sp) 672238384Sjkim lfd $bd,`$FRAME+24`($sp) 673238384Sjkim lfd $na,`$FRAME+32`($sp) 674238384Sjkim lfd $nb,`$FRAME+40`($sp) 675238384Sjkim lfd $nc,`$FRAME+48`($sp) 676238384Sjkim lfd $nd,`$FRAME+56`($sp) 677238384Sjkim 678238384Sjkim fcfid $ba,$ba 679238384Sjkim fcfid $bb,$bb 680238384Sjkim fcfid $bc,$bc 681238384Sjkim fcfid $bd,$bd 682238384Sjkim fcfid $na,$na 683238384Sjkim fcfid $nb,$nb 684238384Sjkim fcfid $nc,$nc 685238384Sjkim fcfid $nd,$nd 686238384Sjkim 687238384Sjkim fmul $T1a,$A1,$ba 688238384Sjkim fmul $T1b,$A1,$bb 689238384Sjkim fmul $T2a,$A2,$ba 690238384Sjkim fmul $T2b,$A2,$bb 691238384Sjkim fmul $T3a,$A3,$ba 692238384Sjkim fmul $T3b,$A3,$bb 693238384Sjkim fmul $T0a,$A0,$ba 694238384Sjkim fmul $T0b,$A0,$bb 695238384Sjkim 696238384Sjkim fmadd $T1a,$A0,$bc,$T1a 697238384Sjkim fmadd $T1b,$A0,$bd,$T1b 698238384Sjkim fmadd $T2a,$A1,$bc,$T2a 699238384Sjkim fmadd $T2b,$A1,$bd,$T2b 700238384Sjkim fmadd $T3a,$A2,$bc,$T3a 701238384Sjkim fmadd $T3b,$A2,$bd,$T3b 702238384Sjkim fmul $dota,$A3,$bc 703238384Sjkim fmul $dotb,$A3,$bd 704238384Sjkim 705238384Sjkim fmadd $T1a,$N1,$na,$T1a 706238384Sjkim fmadd $T1b,$N1,$nb,$T1b 707238384Sjkim lfd $A0,8($nap_d) ; load a[j] in double format 708238384Sjkim lfd $A1,16($nap_d) 709238384Sjkim fmadd $T2a,$N2,$na,$T2a 710238384Sjkim fmadd $T2b,$N2,$nb,$T2b 711238384Sjkim lfd $A2,24($nap_d) ; load a[j+1] in double format 712238384Sjkim lfd $A3,32($nap_d) 713238384Sjkim fmadd $T3a,$N3,$na,$T3a 714238384Sjkim fmadd $T3b,$N3,$nb,$T3b 715238384Sjkim fmadd $T0a,$N0,$na,$T0a 716238384Sjkim fmadd $T0b,$N0,$nb,$T0b 717238384Sjkim 718238384Sjkim fmadd $T1a,$N0,$nc,$T1a 719238384Sjkim fmadd $T1b,$N0,$nd,$T1b 720238384Sjkim fmadd $T2a,$N1,$nc,$T2a 721238384Sjkim fmadd $T2b,$N1,$nd,$T2b 722238384Sjkim fmadd $T3a,$N2,$nc,$T3a 723238384Sjkim fmadd $T3b,$N2,$nd,$T3b 724238384Sjkim fmadd $dota,$N3,$nc,$dota 725238384Sjkim fmadd $dotb,$N3,$nd,$dotb 726238384Sjkim 727238384Sjkim fctid $T0a,$T0a 728238384Sjkim fctid $T0b,$T0b 729238384Sjkim fctid $T1a,$T1a 730238384Sjkim fctid $T1b,$T1b 731238384Sjkim fctid $T2a,$T2a 732238384Sjkim fctid $T2b,$T2b 733238384Sjkim fctid $T3a,$T3a 734238384Sjkim fctid $T3b,$T3b 735238384Sjkim 736238384Sjkim stfd $T0a,`$FRAME+0`($sp) 737238384Sjkim stfd $T0b,`$FRAME+8`($sp) 738238384Sjkim stfd $T1a,`$FRAME+16`($sp) 739238384Sjkim stfd $T1b,`$FRAME+24`($sp) 740238384Sjkim stfd $T2a,`$FRAME+32`($sp) 741238384Sjkim stfd $T2b,`$FRAME+40`($sp) 742238384Sjkim stfd $T3a,`$FRAME+48`($sp) 743238384Sjkim stfd $T3b,`$FRAME+56`($sp) 744238384Sjkim 745238384Sjkim.align 5 746238384SjkimLinner: 747238384Sjkim fmul $T1a,$A1,$ba 748238384Sjkim fmul $T1b,$A1,$bb 749238384Sjkim fmul $T2a,$A2,$ba 750238384Sjkim fmul $T2b,$A2,$bb 751238384Sjkim lfd $N0,40($nap_d) ; load n[j] in double format 752238384Sjkim lfd $N1,48($nap_d) 753238384Sjkim fmul $T3a,$A3,$ba 754238384Sjkim fmul $T3b,$A3,$bb 755238384Sjkim fmadd $T0a,$A0,$ba,$dota 756238384Sjkim fmadd $T0b,$A0,$bb,$dotb 757238384Sjkim lfd $N2,56($nap_d) ; load n[j+1] in double format 758238384Sjkim lfdu $N3,64($nap_d) 759238384Sjkim 760238384Sjkim fmadd $T1a,$A0,$bc,$T1a 761238384Sjkim fmadd $T1b,$A0,$bd,$T1b 762238384Sjkim fmadd $T2a,$A1,$bc,$T2a 763238384Sjkim fmadd $T2b,$A1,$bd,$T2b 764238384Sjkim lfd $A0,8($nap_d) ; load a[j] in double format 765238384Sjkim lfd $A1,16($nap_d) 766238384Sjkim fmadd $T3a,$A2,$bc,$T3a 767238384Sjkim fmadd $T3b,$A2,$bd,$T3b 768238384Sjkim fmul $dota,$A3,$bc 769238384Sjkim fmul $dotb,$A3,$bd 770238384Sjkim lfd $A2,24($nap_d) ; load a[j+1] in double format 771238384Sjkim lfd $A3,32($nap_d) 772238384Sjkim 773238384Sjkim fmadd $T1a,$N1,$na,$T1a 774238384Sjkim fmadd $T1b,$N1,$nb,$T1b 775238384Sjkim ld $t0,`$FRAME+0`($sp) 776238384Sjkim ld $t1,`$FRAME+8`($sp) 777238384Sjkim fmadd $T2a,$N2,$na,$T2a 778238384Sjkim fmadd $T2b,$N2,$nb,$T2b 779238384Sjkim ld $t2,`$FRAME+16`($sp) 780238384Sjkim ld $t3,`$FRAME+24`($sp) 781238384Sjkim fmadd $T3a,$N3,$na,$T3a 782238384Sjkim fmadd $T3b,$N3,$nb,$T3b 783238384Sjkim add $t0,$t0,$carry ; can not overflow 784238384Sjkim ld $t4,`$FRAME+32`($sp) 785238384Sjkim ld $t5,`$FRAME+40`($sp) 786238384Sjkim fmadd $T0a,$N0,$na,$T0a 787238384Sjkim fmadd $T0b,$N0,$nb,$T0b 788238384Sjkim srdi $carry,$t0,16 789238384Sjkim add $t1,$t1,$carry 790238384Sjkim srdi $carry,$t1,16 791238384Sjkim ld $t6,`$FRAME+48`($sp) 792238384Sjkim ld $t7,`$FRAME+56`($sp) 793238384Sjkim 794238384Sjkim fmadd $T1a,$N0,$nc,$T1a 795238384Sjkim fmadd $T1b,$N0,$nd,$T1b 796238384Sjkim insrdi $t0,$t1,16,32 797238384Sjkim ld $t1,8($tp) ; tp[j] 798238384Sjkim fmadd $T2a,$N1,$nc,$T2a 799238384Sjkim fmadd $T2b,$N1,$nd,$T2b 800238384Sjkim add $t2,$t2,$carry 801238384Sjkim fmadd $T3a,$N2,$nc,$T3a 802238384Sjkim fmadd $T3b,$N2,$nd,$T3b 803238384Sjkim srdi $carry,$t2,16 804238384Sjkim insrdi $t0,$t2,16,16 805238384Sjkim fmadd $dota,$N3,$nc,$dota 806238384Sjkim fmadd $dotb,$N3,$nd,$dotb 807238384Sjkim add $t3,$t3,$carry 808238384Sjkim ldu $t2,16($tp) ; tp[j+1] 809238384Sjkim srdi $carry,$t3,16 810238384Sjkim insrdi $t0,$t3,16,0 ; 0..63 bits 811238384Sjkim add $t4,$t4,$carry 812238384Sjkim 813238384Sjkim fctid $T0a,$T0a 814238384Sjkim fctid $T0b,$T0b 815238384Sjkim srdi $carry,$t4,16 816238384Sjkim fctid $T1a,$T1a 817238384Sjkim fctid $T1b,$T1b 818238384Sjkim add $t5,$t5,$carry 819238384Sjkim fctid $T2a,$T2a 820238384Sjkim fctid $T2b,$T2b 821238384Sjkim srdi $carry,$t5,16 822238384Sjkim insrdi $t4,$t5,16,32 823238384Sjkim fctid $T3a,$T3a 824238384Sjkim fctid $T3b,$T3b 825238384Sjkim add $t6,$t6,$carry 826238384Sjkim srdi $carry,$t6,16 827238384Sjkim insrdi $t4,$t6,16,16 828238384Sjkim 829238384Sjkim stfd $T0a,`$FRAME+0`($sp) 830238384Sjkim stfd $T0b,`$FRAME+8`($sp) 831238384Sjkim add $t7,$t7,$carry 832238384Sjkim addc $t3,$t0,$t1 833238384Sjkim___ 834238384Sjkim$code.=<<___ if ($SIZE_T==4); # adjust XER[CA] 835238384Sjkim extrdi $t0,$t0,32,0 836238384Sjkim extrdi $t1,$t1,32,0 837238384Sjkim adde $t0,$t0,$t1 838238384Sjkim___ 839238384Sjkim$code.=<<___; 840238384Sjkim stfd $T1a,`$FRAME+16`($sp) 841238384Sjkim stfd $T1b,`$FRAME+24`($sp) 842238384Sjkim insrdi $t4,$t7,16,0 ; 64..127 bits 843238384Sjkim srdi $carry,$t7,16 ; upper 33 bits 844238384Sjkim stfd $T2a,`$FRAME+32`($sp) 845238384Sjkim stfd $T2b,`$FRAME+40`($sp) 846238384Sjkim adde $t5,$t4,$t2 847238384Sjkim___ 848238384Sjkim$code.=<<___ if ($SIZE_T==4); # adjust XER[CA] 849238384Sjkim extrdi $t4,$t4,32,0 850238384Sjkim extrdi $t2,$t2,32,0 851238384Sjkim adde $t4,$t4,$t2 852238384Sjkim___ 853238384Sjkim$code.=<<___; 854238384Sjkim stfd $T3a,`$FRAME+48`($sp) 855238384Sjkim stfd $T3b,`$FRAME+56`($sp) 856238384Sjkim addze $carry,$carry 857238384Sjkim std $t3,-16($tp) ; tp[j-1] 858238384Sjkim std $t5,-8($tp) ; tp[j] 859238384Sjkim bdnz- Linner 860238384Sjkim 861238384Sjkim fctid $dota,$dota 862238384Sjkim fctid $dotb,$dotb 863238384Sjkim ld $t0,`$FRAME+0`($sp) 864238384Sjkim ld $t1,`$FRAME+8`($sp) 865238384Sjkim ld $t2,`$FRAME+16`($sp) 866238384Sjkim ld $t3,`$FRAME+24`($sp) 867238384Sjkim ld $t4,`$FRAME+32`($sp) 868238384Sjkim ld $t5,`$FRAME+40`($sp) 869238384Sjkim ld $t6,`$FRAME+48`($sp) 870238384Sjkim ld $t7,`$FRAME+56`($sp) 871238384Sjkim stfd $dota,`$FRAME+64`($sp) 872238384Sjkim stfd $dotb,`$FRAME+72`($sp) 873238384Sjkim 874238384Sjkim add $t0,$t0,$carry ; can not overflow 875238384Sjkim srdi $carry,$t0,16 876238384Sjkim add $t1,$t1,$carry 877238384Sjkim srdi $carry,$t1,16 878238384Sjkim insrdi $t0,$t1,16,32 879238384Sjkim add $t2,$t2,$carry 880238384Sjkim ld $t1,8($tp) ; tp[j] 881238384Sjkim srdi $carry,$t2,16 882238384Sjkim insrdi $t0,$t2,16,16 883238384Sjkim add $t3,$t3,$carry 884238384Sjkim ldu $t2,16($tp) ; tp[j+1] 885238384Sjkim srdi $carry,$t3,16 886238384Sjkim insrdi $t0,$t3,16,0 ; 0..63 bits 887238384Sjkim add $t4,$t4,$carry 888238384Sjkim srdi $carry,$t4,16 889238384Sjkim add $t5,$t5,$carry 890238384Sjkim srdi $carry,$t5,16 891238384Sjkim insrdi $t4,$t5,16,32 892238384Sjkim add $t6,$t6,$carry 893238384Sjkim srdi $carry,$t6,16 894238384Sjkim insrdi $t4,$t6,16,16 895238384Sjkim add $t7,$t7,$carry 896238384Sjkim insrdi $t4,$t7,16,0 ; 64..127 bits 897238384Sjkim srdi $carry,$t7,16 ; upper 33 bits 898238384Sjkim ld $t6,`$FRAME+64`($sp) 899238384Sjkim ld $t7,`$FRAME+72`($sp) 900238384Sjkim 901238384Sjkim addc $t3,$t0,$t1 902238384Sjkim___ 903238384Sjkim$code.=<<___ if ($SIZE_T==4); # adjust XER[CA] 904238384Sjkim extrdi $t0,$t0,32,0 905238384Sjkim extrdi $t1,$t1,32,0 906238384Sjkim adde $t0,$t0,$t1 907238384Sjkim___ 908238384Sjkim$code.=<<___; 909238384Sjkim adde $t5,$t4,$t2 910238384Sjkim___ 911238384Sjkim$code.=<<___ if ($SIZE_T==4); # adjust XER[CA] 912238384Sjkim extrdi $t4,$t4,32,0 913238384Sjkim extrdi $t2,$t2,32,0 914238384Sjkim adde $t4,$t4,$t2 915238384Sjkim___ 916238384Sjkim$code.=<<___; 917238384Sjkim addze $carry,$carry 918238384Sjkim 919238384Sjkim std $t3,-16($tp) ; tp[j-1] 920238384Sjkim std $t5,-8($tp) ; tp[j] 921238384Sjkim 922238384Sjkim add $carry,$carry,$ovf ; comsume upmost overflow 923238384Sjkim add $t6,$t6,$carry ; can not overflow 924238384Sjkim srdi $carry,$t6,16 925238384Sjkim add $t7,$t7,$carry 926238384Sjkim insrdi $t6,$t7,48,0 927238384Sjkim srdi $ovf,$t7,48 928238384Sjkim std $t6,0($tp) ; tp[num-1] 929238384Sjkim 930238384Sjkim slwi $t7,$num,2 931238384Sjkim addi $i,$i,8 932238384Sjkim subf $nap_d,$t7,$nap_d ; rewind pointer 933238384Sjkim cmpw $i,$num 934238384Sjkim blt- Louter 935238384Sjkim___ 936238384Sjkim 937238384Sjkim$code.=<<___ if ($SIZE_T==8); 938238384Sjkim subf $np,$num,$np ; rewind np 939238384Sjkim addi $j,$j,1 ; restore counter 940238384Sjkim subfc $i,$i,$i ; j=0 and "clear" XER[CA] 941238384Sjkim addi $tp,$sp,`$FRAME+$TRANSFER+8` 942238384Sjkim addi $t4,$sp,`$FRAME+$TRANSFER+16` 943238384Sjkim addi $t5,$np,8 944238384Sjkim addi $t6,$rp,8 945238384Sjkim mtctr $j 946238384Sjkim 947238384Sjkim.align 4 948238384SjkimLsub: ldx $t0,$tp,$i 949238384Sjkim ldx $t1,$np,$i 950238384Sjkim ldx $t2,$t4,$i 951238384Sjkim ldx $t3,$t5,$i 952238384Sjkim subfe $t0,$t1,$t0 ; tp[j]-np[j] 953238384Sjkim subfe $t2,$t3,$t2 ; tp[j+1]-np[j+1] 954238384Sjkim stdx $t0,$rp,$i 955238384Sjkim stdx $t2,$t6,$i 956238384Sjkim addi $i,$i,16 957238384Sjkim bdnz- Lsub 958238384Sjkim 959238384Sjkim li $i,0 960238384Sjkim subfe $ovf,$i,$ovf ; handle upmost overflow bit 961238384Sjkim and $ap,$tp,$ovf 962238384Sjkim andc $np,$rp,$ovf 963238384Sjkim or $ap,$ap,$np ; ap=borrow?tp:rp 964238384Sjkim addi $t7,$ap,8 965238384Sjkim mtctr $j 966238384Sjkim 967238384Sjkim.align 4 968238384SjkimLcopy: ; copy or in-place refresh 969238384Sjkim ldx $t0,$ap,$i 970238384Sjkim ldx $t1,$t7,$i 971238384Sjkim std $i,8($nap_d) ; zap nap_d 972238384Sjkim std $i,16($nap_d) 973238384Sjkim std $i,24($nap_d) 974238384Sjkim std $i,32($nap_d) 975238384Sjkim std $i,40($nap_d) 976238384Sjkim std $i,48($nap_d) 977238384Sjkim std $i,56($nap_d) 978238384Sjkim stdu $i,64($nap_d) 979238384Sjkim stdx $t0,$rp,$i 980238384Sjkim stdx $t1,$t6,$i 981238384Sjkim stdx $i,$tp,$i ; zap tp at once 982238384Sjkim stdx $i,$t4,$i 983238384Sjkim addi $i,$i,16 984238384Sjkim bdnz- Lcopy 985238384Sjkim___ 986238384Sjkim$code.=<<___ if ($SIZE_T==4); 987238384Sjkim subf $np,$num,$np ; rewind np 988238384Sjkim addi $j,$j,1 ; restore counter 989238384Sjkim subfc $i,$i,$i ; j=0 and "clear" XER[CA] 990238384Sjkim addi $tp,$sp,`$FRAME+$TRANSFER` 991238384Sjkim addi $np,$np,-4 992238384Sjkim addi $rp,$rp,-4 993238384Sjkim addi $ap,$sp,`$FRAME+$TRANSFER+4` 994238384Sjkim mtctr $j 995238384Sjkim 996238384Sjkim.align 4 997238384SjkimLsub: ld $t0,8($tp) ; load tp[j..j+3] in 64-bit word order 998238384Sjkim ldu $t2,16($tp) 999238384Sjkim lwz $t4,4($np) ; load np[j..j+3] in 32-bit word order 1000238384Sjkim lwz $t5,8($np) 1001238384Sjkim lwz $t6,12($np) 1002238384Sjkim lwzu $t7,16($np) 1003238384Sjkim extrdi $t1,$t0,32,0 1004238384Sjkim extrdi $t3,$t2,32,0 1005238384Sjkim subfe $t4,$t4,$t0 ; tp[j]-np[j] 1006238384Sjkim stw $t0,4($ap) ; save tp[j..j+3] in 32-bit word order 1007238384Sjkim subfe $t5,$t5,$t1 ; tp[j+1]-np[j+1] 1008238384Sjkim stw $t1,8($ap) 1009238384Sjkim subfe $t6,$t6,$t2 ; tp[j+2]-np[j+2] 1010238384Sjkim stw $t2,12($ap) 1011238384Sjkim subfe $t7,$t7,$t3 ; tp[j+3]-np[j+3] 1012238384Sjkim stwu $t3,16($ap) 1013238384Sjkim stw $t4,4($rp) 1014238384Sjkim stw $t5,8($rp) 1015238384Sjkim stw $t6,12($rp) 1016238384Sjkim stwu $t7,16($rp) 1017238384Sjkim bdnz- Lsub 1018238384Sjkim 1019238384Sjkim li $i,0 1020238384Sjkim subfe $ovf,$i,$ovf ; handle upmost overflow bit 1021238384Sjkim addi $tp,$sp,`$FRAME+$TRANSFER+4` 1022238384Sjkim subf $rp,$num,$rp ; rewind rp 1023238384Sjkim and $ap,$tp,$ovf 1024238384Sjkim andc $np,$rp,$ovf 1025238384Sjkim or $ap,$ap,$np ; ap=borrow?tp:rp 1026238384Sjkim addi $tp,$sp,`$FRAME+$TRANSFER` 1027238384Sjkim mtctr $j 1028238384Sjkim 1029238384Sjkim.align 4 1030238384SjkimLcopy: ; copy or in-place refresh 1031238384Sjkim lwz $t0,4($ap) 1032238384Sjkim lwz $t1,8($ap) 1033238384Sjkim lwz $t2,12($ap) 1034238384Sjkim lwzu $t3,16($ap) 1035238384Sjkim std $i,8($nap_d) ; zap nap_d 1036238384Sjkim std $i,16($nap_d) 1037238384Sjkim std $i,24($nap_d) 1038238384Sjkim std $i,32($nap_d) 1039238384Sjkim std $i,40($nap_d) 1040238384Sjkim std $i,48($nap_d) 1041238384Sjkim std $i,56($nap_d) 1042238384Sjkim stdu $i,64($nap_d) 1043238384Sjkim stw $t0,4($rp) 1044238384Sjkim stw $t1,8($rp) 1045238384Sjkim stw $t2,12($rp) 1046238384Sjkim stwu $t3,16($rp) 1047238384Sjkim std $i,8($tp) ; zap tp at once 1048238384Sjkim stdu $i,16($tp) 1049238384Sjkim bdnz- Lcopy 1050238384Sjkim___ 1051238384Sjkim 1052238384Sjkim$code.=<<___; 1053238384Sjkim $POP $i,0($sp) 1054238384Sjkim li r3,1 ; signal "handled" 1055238384Sjkim $POP r22,`-12*8-10*$SIZE_T`($i) 1056238384Sjkim $POP r23,`-12*8-9*$SIZE_T`($i) 1057238384Sjkim $POP r24,`-12*8-8*$SIZE_T`($i) 1058238384Sjkim $POP r25,`-12*8-7*$SIZE_T`($i) 1059238384Sjkim $POP r26,`-12*8-6*$SIZE_T`($i) 1060238384Sjkim $POP r27,`-12*8-5*$SIZE_T`($i) 1061238384Sjkim $POP r28,`-12*8-4*$SIZE_T`($i) 1062238384Sjkim $POP r29,`-12*8-3*$SIZE_T`($i) 1063238384Sjkim $POP r30,`-12*8-2*$SIZE_T`($i) 1064238384Sjkim $POP r31,`-12*8-1*$SIZE_T`($i) 1065238384Sjkim lfd f20,`-12*8`($i) 1066238384Sjkim lfd f21,`-11*8`($i) 1067238384Sjkim lfd f22,`-10*8`($i) 1068238384Sjkim lfd f23,`-9*8`($i) 1069238384Sjkim lfd f24,`-8*8`($i) 1070238384Sjkim lfd f25,`-7*8`($i) 1071238384Sjkim lfd f26,`-6*8`($i) 1072238384Sjkim lfd f27,`-5*8`($i) 1073238384Sjkim lfd f28,`-4*8`($i) 1074238384Sjkim lfd f29,`-3*8`($i) 1075238384Sjkim lfd f30,`-2*8`($i) 1076238384Sjkim lfd f31,`-1*8`($i) 1077238384Sjkim mr $sp,$i 1078238384Sjkim blr 1079238384Sjkim .long 0 1080238384Sjkim .byte 0,12,4,0,0x8c,10,6,0 1081238384Sjkim .long 0 1082238384Sjkim 1083238384Sjkim.asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@openssl.org>" 1084238384Sjkim___ 1085238384Sjkim 1086238384Sjkim$code =~ s/\`([^\`]*)\`/eval $1/gem; 1087238384Sjkimprint $code; 1088238384Sjkimclose STDOUT; 1089