#!/usr/bin/env perl
#
# Implemented as a Perl wrapper as we want to support several different
# architectures with single file. We pick up the target based on the
# file name we are asked to generate.
#
# It should be noted though that this perl code is nothing like
# <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
# as pre-processor to cover for platform differences in name decoration,
# linker tables, 32-/64-bit instruction sets...
#
# As you might know there are several PowerPC ABIs in use. Most notably
# Linux and AIX use different 32-bit ABIs. The good news is that these
# ABIs are similar enough to implement leaf(!) functions, which would be
# ABI neutral. And that's what you find here: ABI neutral leaf functions.
# In case you wonder what that is...
#
#        AIX performance
#
#        MEASUREMENTS WITH cc ON a 200 MHz PowerPC 604e.
#
#        The following is the performance of 32-bit compiler
#        generated code:
#
#        OpenSSL 0.9.6c 21 dec 2001
#        built on: Tue Jun 11 11:06:51 EDT 2002
#        options:bn(64,32) ...
#compiler: cc -DTHREADS -DAIX -DB_ENDIAN -DBN_LLONG -O3
#                  sign    verify    sign/s verify/s
#rsa  512 bits   0.0098s   0.0009s    102.0   1170.6
#rsa 1024 bits   0.0507s   0.0026s     19.7    387.5
#rsa 2048 bits   0.3036s   0.0085s      3.3    117.1
#rsa 4096 bits   2.0040s   0.0299s      0.5     33.4
#dsa  512 bits   0.0087s   0.0106s    114.3     94.5
#dsa 1024 bits   0.0256s   0.0313s     39.0     32.0
#
#        Same benchmark with this assembler code:
#
#rsa  512 bits   0.0056s   0.0005s    178.6   2049.2
#rsa 1024 bits   0.0283s   0.0015s     35.3    674.1
#rsa 2048 bits   0.1744s   0.0050s      5.7    201.2
#rsa 4096 bits   1.1644s   0.0179s      0.9     55.7
#dsa  512 bits   0.0052s   0.0062s    191.6    162.0
#dsa 1024 bits   0.0149s   0.0180s     67.0     55.5
#
#        Number of operations increases by almost 75%
#
#        Here are performance numbers for 64-bit compiler
#        generated code:
#
#        OpenSSL 0.9.6g [engine] 9 Aug 2002
#        built on: Fri Apr 18 16:59:20 EDT 2003
#        options:bn(64,64) ...
# compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
#                  sign    verify    sign/s verify/s
#rsa  512 bits   0.0028s   0.0003s    357.1   3844.4
#rsa 1024 bits   0.0148s   0.0008s     67.5   1239.7
#rsa 2048 bits   0.0963s   0.0028s     10.4    353.0
#rsa 4096 bits   0.6538s   0.0102s      1.5     98.1
#dsa  512 bits   0.0026s   0.0032s    382.5    313.7
#dsa 1024 bits   0.0081s   0.0099s    122.8    100.6
#
#        Same benchmark with this assembler code:
#
#rsa  512 bits   0.0020s   0.0002s    510.4   6273.7
#rsa 1024 bits   0.0088s   0.0005s    114.1   2128.3
#rsa 2048 bits   0.0540s   0.0016s     18.5    622.5
#rsa 4096 bits   0.3700s   0.0058s      2.7    171.0
#dsa  512 bits   0.0016s   0.0020s    610.7    507.1
#dsa 1024 bits   0.0047s   0.0058s    212.5    173.2
#
#        Again, performance increases by about 75%
#
#        Mac OS X, Apple G5 1.8GHz (Note this is 32 bit code)
#        OpenSSL 0.9.7c 30 Sep 2003
#
#        Original code.
#
#rsa  512 bits   0.0011s   0.0001s    906.1  11012.5
#rsa 1024 bits   0.0060s   0.0003s    166.6   3363.1
#rsa 2048 bits   0.0370s   0.0010s     27.1    982.4
#rsa 4096 bits   0.2426s   0.0036s      4.1    280.4
#dsa  512 bits   0.0010s   0.0012s   1038.1    841.5
#dsa 1024 bits   0.0030s   0.0037s    329.6    269.7
#dsa 2048 bits   0.0101s   0.0127s     98.9     78.6
#
#        Same benchmark with this assembler code:
#
#rsa  512 bits   0.0007s   0.0001s   1416.2  16645.9
#rsa 1024 bits   0.0036s   0.0002s    274.4   5380.6
#rsa 2048 bits   0.0222s   0.0006s     45.1   1589.5
#rsa 4096 bits   0.1469s   0.0022s      6.8    449.6
#dsa  512 bits   0.0006s   0.0007s   1664.2   1376.2
#dsa 1024 bits   0.0018s   0.0023s    545.0    442.2
#dsa 2048 bits   0.0061s   0.0075s    163.5    132.8
#
#        Performance increase of ~60%
#
#        If you have comments or suggestions to improve code send
#        me a note at schari@us.ibm.com
#

# The target flavour (e.g. "linux32", "aix64") is the first command-line
# argument; a "32" or "64" in it selects the matching mnemonic set below.
$flavour = shift;

if ($flavour =~ /32/) {
	$BITS=	32;
	$BNSZ=	$BITS/8;
	$ISA=	"\"ppc\"";

	$LD=	"lwz";		# load
	$LDU=	"lwzu";		# load and update
	$ST=	"stw";		# store
	$STU=	"stwu";		# store and update
	$UMULL=	"mullw";	# unsigned multiply low
	$UMULH=	"mulhwu";	# unsigned multiply high
	$UDIV=	"divwu";	# unsigned divide
	$UCMPI=	"cmplwi";	# unsigned compare with immediate
	$UCMP=	"cmplw";	# unsigned compare
	$CNTLZ=	"cntlzw";	# count leading zeros
	$SHL=	"slw";		# shift left
	$SHR=	"srw";		# unsigned shift right
	$SHRI=	"srwi";		# unsigned shift right by immediate
	$SHLI=	"slwi";		# shift left by immediate
	$CLRU=	"clrlwi";	# clear upper bits
	$INSR=	"insrwi";	# insert right
	$ROTL=	"rotlwi";	# rotate left by immediate
	$TR=	"tw";		# conditional trap
} elsif ($flavour =~ /64/) {
	$BITS=	64;
	$BNSZ=	$BITS/8;
	$ISA=	"\"ppc64\"";

	# same as above, but 64-bit mnemonics...
	$LD=	"ld";		# load
	$LDU=	"ldu";		# load and update
	$ST=	"std";		# store
	$STU=	"stdu";		# store and update
	$UMULL=	"mulld";	# unsigned multiply low
	$UMULH=	"mulhdu";	# unsigned multiply high
	$UDIV=	"divdu";	# unsigned divide
	$UCMPI=	"cmpldi";	# unsigned compare with immediate
	$UCMP=	"cmpld";	# unsigned compare
	$CNTLZ=	"cntlzd";	# count leading zeros
	$SHL=	"sld";		# shift left
	$SHR=	"srd";		# unsigned shift right
	$SHRI=	"srdi";		# unsigned shift right by immediate
	$SHLI=	"sldi";		# shift left by immediate
	$CLRU=	"clrldi";	# clear upper bits
	$INSR=	"insrdi";	# insert right
	$ROTL=	"rotldi";	# rotate left by immediate
	$TR=	"td";		# conditional trap
} else { die "nonsense $flavour"; }

# Locate the ppc-xlate.pl pre-processor next to this script, or in the
# sibling perlasm directory, and pipe everything we print through it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# BUGFIX: the original used "|| die", but "||" binds tighter than the
# comma-less open argument list, so it attached to the (always-true)
# command string and a failed pipe open was silently ignored.  The
# low-precedence "or" tests open's actual return value.
open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";

$data=<<EOF;
#--------------------------------------------------------------------
#
#
#
166160814Ssimon# 167160814Ssimon# File: ppc32.s 168160814Ssimon# 169160814Ssimon# Created by: Suresh Chari 170160814Ssimon# IBM Thomas J. Watson Research Library 171160814Ssimon# Hawthorne, NY 172160814Ssimon# 173160814Ssimon# 174160814Ssimon# Description: Optimized assembly routines for OpenSSL crypto 175160814Ssimon# on the 32 bitPowerPC platform. 176160814Ssimon# 177160814Ssimon# 178160814Ssimon# Version History 179160814Ssimon# 180160814Ssimon# 2. Fixed bn_add,bn_sub and bn_div_words, added comments, 181160814Ssimon# cleaned up code. Also made a single version which can 182160814Ssimon# be used for both the AIX and Linux compilers. See NOTE 183160814Ssimon# below. 184160814Ssimon# 12/05/03 Suresh Chari 185160814Ssimon# (with lots of help from) Andy Polyakov 186160814Ssimon## 187160814Ssimon# 1. Initial version 10/20/02 Suresh Chari 188160814Ssimon# 189160814Ssimon# 190160814Ssimon# The following file works for the xlc,cc 191160814Ssimon# and gcc compilers. 192160814Ssimon# 193160814Ssimon# NOTE: To get the file to link correctly with the gcc compiler 194160814Ssimon# you have to change the names of the routines and remove 195160814Ssimon# the first .(dot) character. This should automatically 196160814Ssimon# be done in the build process. 197160814Ssimon# 198160814Ssimon# Hand optimized assembly code for the following routines 199160814Ssimon# 200160814Ssimon# bn_sqr_comba4 201160814Ssimon# bn_sqr_comba8 202160814Ssimon# bn_mul_comba4 203160814Ssimon# bn_mul_comba8 204160814Ssimon# bn_sub_words 205160814Ssimon# bn_add_words 206160814Ssimon# bn_div_words 207160814Ssimon# bn_sqr_words 208160814Ssimon# bn_mul_words 209160814Ssimon# bn_mul_add_words 210160814Ssimon# 211160814Ssimon# NOTE: It is possible to optimize this code more for 212160814Ssimon# specific PowerPC or Power architectures. On the Northstar 213160814Ssimon# architecture the optimizations in this file do 214160814Ssimon# NOT provide much improvement. 
215160814Ssimon# 216160814Ssimon# If you have comments or suggestions to improve code send 217160814Ssimon# me a note at schari\@us.ibm.com 218160814Ssimon# 219160814Ssimon#-------------------------------------------------------------------------- 220160814Ssimon# 221160814Ssimon# Defines to be used in the assembly code. 222160814Ssimon# 223238405Sjkim#.set r0,0 # we use it as storage for value of 0 224238405Sjkim#.set SP,1 # preserved 225238405Sjkim#.set RTOC,2 # preserved 226238405Sjkim#.set r3,3 # 1st argument/return value 227238405Sjkim#.set r4,4 # 2nd argument/volatile register 228238405Sjkim#.set r5,5 # 3rd argument/volatile register 229238405Sjkim#.set r6,6 # ... 230238405Sjkim#.set r7,7 231238405Sjkim#.set r8,8 232238405Sjkim#.set r9,9 233238405Sjkim#.set r10,10 234238405Sjkim#.set r11,11 235238405Sjkim#.set r12,12 236238405Sjkim#.set r13,13 # not used, nor any other "below" it... 237160814Ssimon 238160814Ssimon# Declare function names to be global 239160814Ssimon# NOTE: For gcc these names MUST be changed to remove 240160814Ssimon# the first . i.e. for example change ".bn_sqr_comba4" 241160814Ssimon# to "bn_sqr_comba4". This should be automatically done 242160814Ssimon# in the build. 243160814Ssimon 244160814Ssimon .globl .bn_sqr_comba4 245160814Ssimon .globl .bn_sqr_comba8 246160814Ssimon .globl .bn_mul_comba4 247160814Ssimon .globl .bn_mul_comba8 248160814Ssimon .globl .bn_sub_words 249160814Ssimon .globl .bn_add_words 250160814Ssimon .globl .bn_div_words 251160814Ssimon .globl .bn_sqr_words 252160814Ssimon .globl .bn_mul_words 253160814Ssimon .globl .bn_mul_add_words 254160814Ssimon 255160814Ssimon# .text section 256160814Ssimon 257238405Sjkim .machine "any" 258160814Ssimon 259160814Ssimon# 260160814Ssimon# NOTE: The following label name should be changed to 261160814Ssimon# "bn_sqr_comba4" i.e. remove the first dot 262160814Ssimon# for the gcc compiler. 
This should be automatically 263160814Ssimon# done in the build 264160814Ssimon# 265160814Ssimon 266160814Ssimon.align 4 267160814Ssimon.bn_sqr_comba4: 268160814Ssimon# 269160814Ssimon# Optimized version of bn_sqr_comba4. 270160814Ssimon# 271160814Ssimon# void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a) 272160814Ssimon# r3 contains r 273160814Ssimon# r4 contains a 274160814Ssimon# 275160814Ssimon# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows: 276160814Ssimon# 277160814Ssimon# r5,r6 are the two BN_ULONGs being multiplied. 278160814Ssimon# r7,r8 are the results of the 32x32 giving 64 bit multiply. 279160814Ssimon# r9,r10, r11 are the equivalents of c1,c2, c3. 280160814Ssimon# Here's the assembly 281160814Ssimon# 282160814Ssimon# 283160814Ssimon xor r0,r0,r0 # set r0 = 0. Used in the addze 284160814Ssimon # instructions below 285160814Ssimon 286160814Ssimon #sqr_add_c(a,0,c1,c2,c3) 287160814Ssimon $LD r5,`0*$BNSZ`(r4) 288160814Ssimon $UMULL r9,r5,r5 289160814Ssimon $UMULH r10,r5,r5 #in first iteration. No need 290160814Ssimon #to add since c1=c2=c3=0. 291160814Ssimon # Note c3(r11) is NOT set to 0 292160814Ssimon # but will be. 293160814Ssimon 294160814Ssimon $ST r9,`0*$BNSZ`(r3) # r[0]=c1; 295160814Ssimon # sqr_add_c2(a,1,0,c2,c3,c1); 296160814Ssimon $LD r6,`1*$BNSZ`(r4) 297160814Ssimon $UMULL r7,r5,r6 298160814Ssimon $UMULH r8,r5,r6 299160814Ssimon 300160814Ssimon addc r7,r7,r7 # compute (r7,r8)=2*(r7,r8) 301160814Ssimon adde r8,r8,r8 302160814Ssimon addze r9,r0 # catch carry if any. 303160814Ssimon # r9= r0(=0) and carry 304160814Ssimon 305160814Ssimon addc r10,r7,r10 # now add to temp result. 
306160814Ssimon addze r11,r8 # r8 added to r11 which is 0 307160814Ssimon addze r9,r9 308160814Ssimon 309160814Ssimon $ST r10,`1*$BNSZ`(r3) #r[1]=c2; 310160814Ssimon #sqr_add_c(a,1,c3,c1,c2) 311160814Ssimon $UMULL r7,r6,r6 312160814Ssimon $UMULH r8,r6,r6 313160814Ssimon addc r11,r7,r11 314160814Ssimon adde r9,r8,r9 315160814Ssimon addze r10,r0 316160814Ssimon #sqr_add_c2(a,2,0,c3,c1,c2) 317160814Ssimon $LD r6,`2*$BNSZ`(r4) 318160814Ssimon $UMULL r7,r5,r6 319160814Ssimon $UMULH r8,r5,r6 320160814Ssimon 321160814Ssimon addc r7,r7,r7 322160814Ssimon adde r8,r8,r8 323160814Ssimon addze r10,r10 324160814Ssimon 325160814Ssimon addc r11,r7,r11 326160814Ssimon adde r9,r8,r9 327160814Ssimon addze r10,r10 328160814Ssimon $ST r11,`2*$BNSZ`(r3) #r[2]=c3 329160814Ssimon #sqr_add_c2(a,3,0,c1,c2,c3); 330160814Ssimon $LD r6,`3*$BNSZ`(r4) 331160814Ssimon $UMULL r7,r5,r6 332160814Ssimon $UMULH r8,r5,r6 333160814Ssimon addc r7,r7,r7 334160814Ssimon adde r8,r8,r8 335160814Ssimon addze r11,r0 336160814Ssimon 337160814Ssimon addc r9,r7,r9 338160814Ssimon adde r10,r8,r10 339160814Ssimon addze r11,r11 340160814Ssimon #sqr_add_c2(a,2,1,c1,c2,c3); 341160814Ssimon $LD r5,`1*$BNSZ`(r4) 342160814Ssimon $LD r6,`2*$BNSZ`(r4) 343160814Ssimon $UMULL r7,r5,r6 344160814Ssimon $UMULH r8,r5,r6 345160814Ssimon 346160814Ssimon addc r7,r7,r7 347160814Ssimon adde r8,r8,r8 348160814Ssimon addze r11,r11 349160814Ssimon addc r9,r7,r9 350160814Ssimon adde r10,r8,r10 351160814Ssimon addze r11,r11 352160814Ssimon $ST r9,`3*$BNSZ`(r3) #r[3]=c1 353160814Ssimon #sqr_add_c(a,2,c2,c3,c1); 354160814Ssimon $UMULL r7,r6,r6 355160814Ssimon $UMULH r8,r6,r6 356160814Ssimon addc r10,r7,r10 357160814Ssimon adde r11,r8,r11 358160814Ssimon addze r9,r0 359160814Ssimon #sqr_add_c2(a,3,1,c2,c3,c1); 360160814Ssimon $LD r6,`3*$BNSZ`(r4) 361160814Ssimon $UMULL r7,r5,r6 362160814Ssimon $UMULH r8,r5,r6 363160814Ssimon addc r7,r7,r7 364160814Ssimon adde r8,r8,r8 365160814Ssimon addze r9,r9 366160814Ssimon 367160814Ssimon addc 
r10,r7,r10 368160814Ssimon adde r11,r8,r11 369160814Ssimon addze r9,r9 370160814Ssimon $ST r10,`4*$BNSZ`(r3) #r[4]=c2 371160814Ssimon #sqr_add_c2(a,3,2,c3,c1,c2); 372160814Ssimon $LD r5,`2*$BNSZ`(r4) 373160814Ssimon $UMULL r7,r5,r6 374160814Ssimon $UMULH r8,r5,r6 375160814Ssimon addc r7,r7,r7 376160814Ssimon adde r8,r8,r8 377160814Ssimon addze r10,r0 378160814Ssimon 379160814Ssimon addc r11,r7,r11 380160814Ssimon adde r9,r8,r9 381160814Ssimon addze r10,r10 382160814Ssimon $ST r11,`5*$BNSZ`(r3) #r[5] = c3 383160814Ssimon #sqr_add_c(a,3,c1,c2,c3); 384160814Ssimon $UMULL r7,r6,r6 385160814Ssimon $UMULH r8,r6,r6 386160814Ssimon addc r9,r7,r9 387160814Ssimon adde r10,r8,r10 388160814Ssimon 389160814Ssimon $ST r9,`6*$BNSZ`(r3) #r[6]=c1 390160814Ssimon $ST r10,`7*$BNSZ`(r3) #r[7]=c2 391238405Sjkim blr 392238405Sjkim .long 0 393238405Sjkim .byte 0,12,0x14,0,0,0,2,0 394238405Sjkim .long 0 395160814Ssimon 396160814Ssimon# 397160814Ssimon# NOTE: The following label name should be changed to 398160814Ssimon# "bn_sqr_comba8" i.e. remove the first dot 399160814Ssimon# for the gcc compiler. This should be automatically 400160814Ssimon# done in the build 401160814Ssimon# 402160814Ssimon 403160814Ssimon.align 4 404160814Ssimon.bn_sqr_comba8: 405160814Ssimon# 406160814Ssimon# This is an optimized version of the bn_sqr_comba8 routine. 407160814Ssimon# Tightly uses the adde instruction 408160814Ssimon# 409160814Ssimon# 410160814Ssimon# void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a) 411160814Ssimon# r3 contains r 412160814Ssimon# r4 contains a 413160814Ssimon# 414160814Ssimon# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows: 415160814Ssimon# 416160814Ssimon# r5,r6 are the two BN_ULONGs being multiplied. 417160814Ssimon# r7,r8 are the results of the 32x32 giving 64 bit multiply. 418160814Ssimon# r9,r10, r11 are the equivalents of c1,c2, c3. 
419160814Ssimon# 420160814Ssimon# Possible optimization of loading all 8 longs of a into registers 421160814Ssimon# doesnt provide any speedup 422160814Ssimon# 423160814Ssimon 424160814Ssimon xor r0,r0,r0 #set r0 = 0.Used in addze 425160814Ssimon #instructions below. 426160814Ssimon 427160814Ssimon #sqr_add_c(a,0,c1,c2,c3); 428160814Ssimon $LD r5,`0*$BNSZ`(r4) 429160814Ssimon $UMULL r9,r5,r5 #1st iteration: no carries. 430160814Ssimon $UMULH r10,r5,r5 431160814Ssimon $ST r9,`0*$BNSZ`(r3) # r[0]=c1; 432160814Ssimon #sqr_add_c2(a,1,0,c2,c3,c1); 433160814Ssimon $LD r6,`1*$BNSZ`(r4) 434160814Ssimon $UMULL r7,r5,r6 435160814Ssimon $UMULH r8,r5,r6 436160814Ssimon 437160814Ssimon addc r10,r7,r10 #add the two register number 438160814Ssimon adde r11,r8,r0 # (r8,r7) to the three register 439160814Ssimon addze r9,r0 # number (r9,r11,r10).NOTE:r0=0 440160814Ssimon 441160814Ssimon addc r10,r7,r10 #add the two register number 442160814Ssimon adde r11,r8,r11 # (r8,r7) to the three register 443160814Ssimon addze r9,r9 # number (r9,r11,r10). 444160814Ssimon 445160814Ssimon $ST r10,`1*$BNSZ`(r3) # r[1]=c2 446160814Ssimon 447160814Ssimon #sqr_add_c(a,1,c3,c1,c2); 448160814Ssimon $UMULL r7,r6,r6 449160814Ssimon $UMULH r8,r6,r6 450160814Ssimon addc r11,r7,r11 451160814Ssimon adde r9,r8,r9 452160814Ssimon addze r10,r0 453160814Ssimon #sqr_add_c2(a,2,0,c3,c1,c2); 454160814Ssimon $LD r6,`2*$BNSZ`(r4) 455160814Ssimon $UMULL r7,r5,r6 456160814Ssimon $UMULH r8,r5,r6 457160814Ssimon 458160814Ssimon addc r11,r7,r11 459160814Ssimon adde r9,r8,r9 460160814Ssimon addze r10,r10 461160814Ssimon 462160814Ssimon addc r11,r7,r11 463160814Ssimon adde r9,r8,r9 464160814Ssimon addze r10,r10 465160814Ssimon 466160814Ssimon $ST r11,`2*$BNSZ`(r3) #r[2]=c3 467160814Ssimon #sqr_add_c2(a,3,0,c1,c2,c3); 468160814Ssimon $LD r6,`3*$BNSZ`(r4) #r6 = a[3]. r5 is already a[0]. 
469160814Ssimon $UMULL r7,r5,r6 470160814Ssimon $UMULH r8,r5,r6 471160814Ssimon 472160814Ssimon addc r9,r7,r9 473160814Ssimon adde r10,r8,r10 474160814Ssimon addze r11,r0 475160814Ssimon 476160814Ssimon addc r9,r7,r9 477160814Ssimon adde r10,r8,r10 478160814Ssimon addze r11,r11 479160814Ssimon #sqr_add_c2(a,2,1,c1,c2,c3); 480160814Ssimon $LD r5,`1*$BNSZ`(r4) 481160814Ssimon $LD r6,`2*$BNSZ`(r4) 482160814Ssimon $UMULL r7,r5,r6 483160814Ssimon $UMULH r8,r5,r6 484160814Ssimon 485160814Ssimon addc r9,r7,r9 486160814Ssimon adde r10,r8,r10 487160814Ssimon addze r11,r11 488160814Ssimon 489160814Ssimon addc r9,r7,r9 490160814Ssimon adde r10,r8,r10 491160814Ssimon addze r11,r11 492160814Ssimon 493160814Ssimon $ST r9,`3*$BNSZ`(r3) #r[3]=c1; 494160814Ssimon #sqr_add_c(a,2,c2,c3,c1); 495160814Ssimon $UMULL r7,r6,r6 496160814Ssimon $UMULH r8,r6,r6 497160814Ssimon 498160814Ssimon addc r10,r7,r10 499160814Ssimon adde r11,r8,r11 500160814Ssimon addze r9,r0 501160814Ssimon #sqr_add_c2(a,3,1,c2,c3,c1); 502160814Ssimon $LD r6,`3*$BNSZ`(r4) 503160814Ssimon $UMULL r7,r5,r6 504160814Ssimon $UMULH r8,r5,r6 505160814Ssimon 506160814Ssimon addc r10,r7,r10 507160814Ssimon adde r11,r8,r11 508160814Ssimon addze r9,r9 509160814Ssimon 510160814Ssimon addc r10,r7,r10 511160814Ssimon adde r11,r8,r11 512160814Ssimon addze r9,r9 513160814Ssimon #sqr_add_c2(a,4,0,c2,c3,c1); 514160814Ssimon $LD r5,`0*$BNSZ`(r4) 515160814Ssimon $LD r6,`4*$BNSZ`(r4) 516160814Ssimon $UMULL r7,r5,r6 517160814Ssimon $UMULH r8,r5,r6 518160814Ssimon 519160814Ssimon addc r10,r7,r10 520160814Ssimon adde r11,r8,r11 521160814Ssimon addze r9,r9 522160814Ssimon 523160814Ssimon addc r10,r7,r10 524160814Ssimon adde r11,r8,r11 525160814Ssimon addze r9,r9 526160814Ssimon $ST r10,`4*$BNSZ`(r3) #r[4]=c2; 527160814Ssimon #sqr_add_c2(a,5,0,c3,c1,c2); 528160814Ssimon $LD r6,`5*$BNSZ`(r4) 529160814Ssimon $UMULL r7,r5,r6 530160814Ssimon $UMULH r8,r5,r6 531160814Ssimon 532160814Ssimon addc r11,r7,r11 533160814Ssimon adde r9,r8,r9 
534160814Ssimon addze r10,r0 535160814Ssimon 536160814Ssimon addc r11,r7,r11 537160814Ssimon adde r9,r8,r9 538160814Ssimon addze r10,r10 539160814Ssimon #sqr_add_c2(a,4,1,c3,c1,c2); 540160814Ssimon $LD r5,`1*$BNSZ`(r4) 541160814Ssimon $LD r6,`4*$BNSZ`(r4) 542160814Ssimon $UMULL r7,r5,r6 543160814Ssimon $UMULH r8,r5,r6 544160814Ssimon 545160814Ssimon addc r11,r7,r11 546160814Ssimon adde r9,r8,r9 547160814Ssimon addze r10,r10 548160814Ssimon 549160814Ssimon addc r11,r7,r11 550160814Ssimon adde r9,r8,r9 551160814Ssimon addze r10,r10 552160814Ssimon #sqr_add_c2(a,3,2,c3,c1,c2); 553160814Ssimon $LD r5,`2*$BNSZ`(r4) 554160814Ssimon $LD r6,`3*$BNSZ`(r4) 555160814Ssimon $UMULL r7,r5,r6 556160814Ssimon $UMULH r8,r5,r6 557160814Ssimon 558160814Ssimon addc r11,r7,r11 559160814Ssimon adde r9,r8,r9 560160814Ssimon addze r10,r10 561160814Ssimon 562160814Ssimon addc r11,r7,r11 563160814Ssimon adde r9,r8,r9 564160814Ssimon addze r10,r10 565160814Ssimon $ST r11,`5*$BNSZ`(r3) #r[5]=c3; 566160814Ssimon #sqr_add_c(a,3,c1,c2,c3); 567160814Ssimon $UMULL r7,r6,r6 568160814Ssimon $UMULH r8,r6,r6 569160814Ssimon addc r9,r7,r9 570160814Ssimon adde r10,r8,r10 571160814Ssimon addze r11,r0 572160814Ssimon #sqr_add_c2(a,4,2,c1,c2,c3); 573160814Ssimon $LD r6,`4*$BNSZ`(r4) 574160814Ssimon $UMULL r7,r5,r6 575160814Ssimon $UMULH r8,r5,r6 576160814Ssimon 577160814Ssimon addc r9,r7,r9 578160814Ssimon adde r10,r8,r10 579160814Ssimon addze r11,r11 580160814Ssimon 581160814Ssimon addc r9,r7,r9 582160814Ssimon adde r10,r8,r10 583160814Ssimon addze r11,r11 584160814Ssimon #sqr_add_c2(a,5,1,c1,c2,c3); 585160814Ssimon $LD r5,`1*$BNSZ`(r4) 586160814Ssimon $LD r6,`5*$BNSZ`(r4) 587160814Ssimon $UMULL r7,r5,r6 588160814Ssimon $UMULH r8,r5,r6 589160814Ssimon 590160814Ssimon addc r9,r7,r9 591160814Ssimon adde r10,r8,r10 592160814Ssimon addze r11,r11 593160814Ssimon 594160814Ssimon addc r9,r7,r9 595160814Ssimon adde r10,r8,r10 596160814Ssimon addze r11,r11 597160814Ssimon #sqr_add_c2(a,6,0,c1,c2,c3); 
598160814Ssimon $LD r5,`0*$BNSZ`(r4) 599160814Ssimon $LD r6,`6*$BNSZ`(r4) 600160814Ssimon $UMULL r7,r5,r6 601160814Ssimon $UMULH r8,r5,r6 602160814Ssimon addc r9,r7,r9 603160814Ssimon adde r10,r8,r10 604160814Ssimon addze r11,r11 605160814Ssimon addc r9,r7,r9 606160814Ssimon adde r10,r8,r10 607160814Ssimon addze r11,r11 608160814Ssimon $ST r9,`6*$BNSZ`(r3) #r[6]=c1; 609160814Ssimon #sqr_add_c2(a,7,0,c2,c3,c1); 610160814Ssimon $LD r6,`7*$BNSZ`(r4) 611160814Ssimon $UMULL r7,r5,r6 612160814Ssimon $UMULH r8,r5,r6 613160814Ssimon 614160814Ssimon addc r10,r7,r10 615160814Ssimon adde r11,r8,r11 616160814Ssimon addze r9,r0 617160814Ssimon addc r10,r7,r10 618160814Ssimon adde r11,r8,r11 619160814Ssimon addze r9,r9 620160814Ssimon #sqr_add_c2(a,6,1,c2,c3,c1); 621160814Ssimon $LD r5,`1*$BNSZ`(r4) 622160814Ssimon $LD r6,`6*$BNSZ`(r4) 623160814Ssimon $UMULL r7,r5,r6 624160814Ssimon $UMULH r8,r5,r6 625160814Ssimon 626160814Ssimon addc r10,r7,r10 627160814Ssimon adde r11,r8,r11 628160814Ssimon addze r9,r9 629160814Ssimon addc r10,r7,r10 630160814Ssimon adde r11,r8,r11 631160814Ssimon addze r9,r9 632160814Ssimon #sqr_add_c2(a,5,2,c2,c3,c1); 633160814Ssimon $LD r5,`2*$BNSZ`(r4) 634160814Ssimon $LD r6,`5*$BNSZ`(r4) 635160814Ssimon $UMULL r7,r5,r6 636160814Ssimon $UMULH r8,r5,r6 637160814Ssimon addc r10,r7,r10 638160814Ssimon adde r11,r8,r11 639160814Ssimon addze r9,r9 640160814Ssimon addc r10,r7,r10 641160814Ssimon adde r11,r8,r11 642160814Ssimon addze r9,r9 643160814Ssimon #sqr_add_c2(a,4,3,c2,c3,c1); 644160814Ssimon $LD r5,`3*$BNSZ`(r4) 645160814Ssimon $LD r6,`4*$BNSZ`(r4) 646160814Ssimon $UMULL r7,r5,r6 647160814Ssimon $UMULH r8,r5,r6 648160814Ssimon 649160814Ssimon addc r10,r7,r10 650160814Ssimon adde r11,r8,r11 651160814Ssimon addze r9,r9 652160814Ssimon addc r10,r7,r10 653160814Ssimon adde r11,r8,r11 654160814Ssimon addze r9,r9 655160814Ssimon $ST r10,`7*$BNSZ`(r3) #r[7]=c2; 656160814Ssimon #sqr_add_c(a,4,c3,c1,c2); 657160814Ssimon $UMULL r7,r6,r6 658160814Ssimon $UMULH 
r8,r6,r6 659160814Ssimon addc r11,r7,r11 660160814Ssimon adde r9,r8,r9 661160814Ssimon addze r10,r0 662160814Ssimon #sqr_add_c2(a,5,3,c3,c1,c2); 663160814Ssimon $LD r6,`5*$BNSZ`(r4) 664160814Ssimon $UMULL r7,r5,r6 665160814Ssimon $UMULH r8,r5,r6 666160814Ssimon addc r11,r7,r11 667160814Ssimon adde r9,r8,r9 668160814Ssimon addze r10,r10 669160814Ssimon addc r11,r7,r11 670160814Ssimon adde r9,r8,r9 671160814Ssimon addze r10,r10 672160814Ssimon #sqr_add_c2(a,6,2,c3,c1,c2); 673160814Ssimon $LD r5,`2*$BNSZ`(r4) 674160814Ssimon $LD r6,`6*$BNSZ`(r4) 675160814Ssimon $UMULL r7,r5,r6 676160814Ssimon $UMULH r8,r5,r6 677160814Ssimon addc r11,r7,r11 678160814Ssimon adde r9,r8,r9 679160814Ssimon addze r10,r10 680160814Ssimon 681160814Ssimon addc r11,r7,r11 682160814Ssimon adde r9,r8,r9 683160814Ssimon addze r10,r10 684160814Ssimon #sqr_add_c2(a,7,1,c3,c1,c2); 685160814Ssimon $LD r5,`1*$BNSZ`(r4) 686160814Ssimon $LD r6,`7*$BNSZ`(r4) 687160814Ssimon $UMULL r7,r5,r6 688160814Ssimon $UMULH r8,r5,r6 689160814Ssimon addc r11,r7,r11 690160814Ssimon adde r9,r8,r9 691160814Ssimon addze r10,r10 692160814Ssimon addc r11,r7,r11 693160814Ssimon adde r9,r8,r9 694160814Ssimon addze r10,r10 695160814Ssimon $ST r11,`8*$BNSZ`(r3) #r[8]=c3; 696160814Ssimon #sqr_add_c2(a,7,2,c1,c2,c3); 697160814Ssimon $LD r5,`2*$BNSZ`(r4) 698160814Ssimon $UMULL r7,r5,r6 699160814Ssimon $UMULH r8,r5,r6 700160814Ssimon 701160814Ssimon addc r9,r7,r9 702160814Ssimon adde r10,r8,r10 703160814Ssimon addze r11,r0 704160814Ssimon addc r9,r7,r9 705160814Ssimon adde r10,r8,r10 706160814Ssimon addze r11,r11 707160814Ssimon #sqr_add_c2(a,6,3,c1,c2,c3); 708160814Ssimon $LD r5,`3*$BNSZ`(r4) 709160814Ssimon $LD r6,`6*$BNSZ`(r4) 710160814Ssimon $UMULL r7,r5,r6 711160814Ssimon $UMULH r8,r5,r6 712160814Ssimon addc r9,r7,r9 713160814Ssimon adde r10,r8,r10 714160814Ssimon addze r11,r11 715160814Ssimon addc r9,r7,r9 716160814Ssimon adde r10,r8,r10 717160814Ssimon addze r11,r11 718160814Ssimon #sqr_add_c2(a,5,4,c1,c2,c3); 
719160814Ssimon $LD r5,`4*$BNSZ`(r4) 720160814Ssimon $LD r6,`5*$BNSZ`(r4) 721160814Ssimon $UMULL r7,r5,r6 722160814Ssimon $UMULH r8,r5,r6 723160814Ssimon addc r9,r7,r9 724160814Ssimon adde r10,r8,r10 725160814Ssimon addze r11,r11 726160814Ssimon addc r9,r7,r9 727160814Ssimon adde r10,r8,r10 728160814Ssimon addze r11,r11 729160814Ssimon $ST r9,`9*$BNSZ`(r3) #r[9]=c1; 730160814Ssimon #sqr_add_c(a,5,c2,c3,c1); 731160814Ssimon $UMULL r7,r6,r6 732160814Ssimon $UMULH r8,r6,r6 733160814Ssimon addc r10,r7,r10 734160814Ssimon adde r11,r8,r11 735160814Ssimon addze r9,r0 736160814Ssimon #sqr_add_c2(a,6,4,c2,c3,c1); 737160814Ssimon $LD r6,`6*$BNSZ`(r4) 738160814Ssimon $UMULL r7,r5,r6 739160814Ssimon $UMULH r8,r5,r6 740160814Ssimon addc r10,r7,r10 741160814Ssimon adde r11,r8,r11 742160814Ssimon addze r9,r9 743160814Ssimon addc r10,r7,r10 744160814Ssimon adde r11,r8,r11 745160814Ssimon addze r9,r9 746160814Ssimon #sqr_add_c2(a,7,3,c2,c3,c1); 747160814Ssimon $LD r5,`3*$BNSZ`(r4) 748160814Ssimon $LD r6,`7*$BNSZ`(r4) 749160814Ssimon $UMULL r7,r5,r6 750160814Ssimon $UMULH r8,r5,r6 751160814Ssimon addc r10,r7,r10 752160814Ssimon adde r11,r8,r11 753160814Ssimon addze r9,r9 754160814Ssimon addc r10,r7,r10 755160814Ssimon adde r11,r8,r11 756160814Ssimon addze r9,r9 757160814Ssimon $ST r10,`10*$BNSZ`(r3) #r[10]=c2; 758160814Ssimon #sqr_add_c2(a,7,4,c3,c1,c2); 759160814Ssimon $LD r5,`4*$BNSZ`(r4) 760160814Ssimon $UMULL r7,r5,r6 761160814Ssimon $UMULH r8,r5,r6 762160814Ssimon addc r11,r7,r11 763160814Ssimon adde r9,r8,r9 764160814Ssimon addze r10,r0 765160814Ssimon addc r11,r7,r11 766160814Ssimon adde r9,r8,r9 767160814Ssimon addze r10,r10 768160814Ssimon #sqr_add_c2(a,6,5,c3,c1,c2); 769160814Ssimon $LD r5,`5*$BNSZ`(r4) 770160814Ssimon $LD r6,`6*$BNSZ`(r4) 771160814Ssimon $UMULL r7,r5,r6 772160814Ssimon $UMULH r8,r5,r6 773160814Ssimon addc r11,r7,r11 774160814Ssimon adde r9,r8,r9 775160814Ssimon addze r10,r10 776160814Ssimon addc r11,r7,r11 777160814Ssimon adde r9,r8,r9 778160814Ssimon 
	# --- tail of the preceding squaring (comba) routine: finishes columns 11..15 ---
	addze	r10,r10
	$ST	r11,`11*$BNSZ`(r3)	#r[11]=c3;
	#sqr_add_c(a,6,c1,c2,c3);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r0
	#sqr_add_c2(a,7,5,c1,c2,c3)
	$LD	r6,`7*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	addc	r9,r7,r9		# off-diagonal product accumulated twice (the *2 of sqr_add_c2)
	adde	r10,r8,r10
	addze	r11,r11
	$ST	r9,`12*$BNSZ`(r3)	#r[12]=c1;
	#sqr_add_c2(a,7,6,c2,c3,c1)
	$LD	r5,`6*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r0
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	$ST	r10,`13*$BNSZ`(r3)	#r[13]=c2;
	#sqr_add_c(a,7,c3,c1,c2);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	$ST	r11,`14*$BNSZ`(r3)	#r[14]=c3;
	$ST	r9, `15*$BNSZ`(r3)	#r[15]=c1;

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0	# NOTE(review): looks like an ABI traceback/metadata tag — confirm against perlasm ppc framework
	.long	0

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_comba4" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_comba4:
#
# This is an optimized version of the bn_mul_comba4 routine.
#
# void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
# r3 contains r
# r4 contains a
# r5 contains b
# r6, r7 are the 2 BN_ULONGs being multiplied.
# r8, r9 are the results of the 32x32 giving 64 multiply.
# r10, r11, r12 are the equivalents of c1, c2, and c3.
#
# Each #mul_add_c(a[i],b[j],cX,cY,cZ) step below is: 32x32->64 multiply
# ($UMULL low / $UMULH high) accumulated into the running column sums
# with carry propagation (addc/adde/addze).
#
	xor	r0,r0,r0		#r0=0. Used in addze below.
	#mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r10,r6,r7
	$UMULH	r11,r6,r7
	$ST	r10,`0*$BNSZ`(r3)	#r[0]=c1
	#mul_add_c(a[0],b[1],c2,c3,c1);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r0
	addze	r10,r0
	#mul_add_c(a[1],b[0],c2,c3,c1);
	$LD	r6, `1*$BNSZ`(r4)
	$LD	r7, `0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r10
	$ST	r11,`1*$BNSZ`(r3)	#r[1]=c2
	#mul_add_c(a[2],b[0],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r0
	#mul_add_c(a[1],b[1],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r11
	#mul_add_c(a[0],b[2],c3,c1,c2);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r11
	$ST	r12,`2*$BNSZ`(r3)	#r[2]=c3
	#mul_add_c(a[0],b[3],c1,c2,c3);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r0
	#mul_add_c(a[1],b[2],c1,c2,c3);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r12
	#mul_add_c(a[2],b[1],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r12
	#mul_add_c(a[3],b[0],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r12
	$ST	r10,`3*$BNSZ`(r3)	#r[3]=c1
	#mul_add_c(a[3],b[1],c2,c3,c1);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r0
	#mul_add_c(a[2],b[2],c2,c3,c1);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r10
	#mul_add_c(a[1],b[3],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r10
	$ST	r11,`4*$BNSZ`(r3)	#r[4]=c2
	#mul_add_c(a[2],b[3],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r0
	#mul_add_c(a[3],b[2],c3,c1,c2);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r11
	$ST	r12,`5*$BNSZ`(r3)	#r[5]=c3
	#mul_add_c(a[3],b[3],c1,c2,c3);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11	# top column cannot carry further: no addze needed

	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1
	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_comba8" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_comba8:
#
# Optimized version of the bn_mul_comba8 routine.
#
# void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
# r3 contains r
# r4 contains a
# r5 contains b
# r6, r7 are the 2 BN_ULONGs being multiplied.
# r8, r9 are the results of the 32x32 giving 64 multiply.
# r10, r11, r12 are the equivalents of c1, c2, and c3.
#
	xor	r0,r0,r0		#r0=0. Used in addze below.
	# Column-by-column comba accumulation; each #mul_add_c comment names the
	# (a[i], b[j]) partial product folded into the running (c1,c2,c3) column sums.
	#mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	r6,`0*$BNSZ`(r4)	#a[0]
	$LD	r7,`0*$BNSZ`(r5)	#b[0]
	$UMULL	r10,r6,r7
	$UMULH	r11,r6,r7
	$ST	r10,`0*$BNSZ`(r3)	#r[0]=c1;
	#mul_add_c(a[0],b[1],c2,c3,c1);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	addze	r12,r9			# since we didn't set r12 to zero before.
	addze	r10,r0
	#mul_add_c(a[1],b[0],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`1*$BNSZ`(r3)	#r[1]=c2;
	#mul_add_c(a[2],b[0],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r0
	#mul_add_c(a[1],b[1],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[0],b[2],c3,c1,c2);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	$ST	r12,`2*$BNSZ`(r3)	#r[2]=c3;
	#mul_add_c(a[0],b[3],c1,c2,c3);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r0
	#mul_add_c(a[1],b[2],c1,c2,c3);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12

	#mul_add_c(a[2],b[1],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[3],b[0],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	$ST	r10,`3*$BNSZ`(r3)	#r[3]=c1;
	#mul_add_c(a[4],b[0],c2,c3,c1);
	$LD	r6,`4*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r0
	#mul_add_c(a[3],b[1],c2,c3,c1);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[2],b[2],c2,c3,c1);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[1],b[3],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[0],b[4],c2,c3,c1);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`4*$BNSZ`(r3)	#r[4]=c2;
	#mul_add_c(a[0],b[5],c3,c1,c2);
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r0
	#mul_add_c(a[1],b[4],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[2],b[3],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[3],b[2],c3,c1,c2);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[4],b[1],c3,c1,c2);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[5],b[0],c3,c1,c2);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	$ST	r12,`5*$BNSZ`(r3)	#r[5]=c3;
	#mul_add_c(a[6],b[0],c1,c2,c3);
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r0
	#mul_add_c(a[5],b[1],c1,c2,c3);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[4],b[2],c1,c2,c3);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[3],b[3],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[2],b[4],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[1],b[5],c1,c2,c3);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[0],b[6],c1,c2,c3);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1;
	#mul_add_c(a[0],b[7],c2,c3,c1);
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r0
	#mul_add_c(a[1],b[6],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[2],b[5],c2,c3,c1);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[3],b[4],c2,c3,c1);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[4],b[3],c2,c3,c1);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[5],b[2],c2,c3,c1);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[6],b[1],c2,c3,c1);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[7],b[0],c2,c3,c1);
	$LD	r6,`7*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2;
	#mul_add_c(a[7],b[1],c3,c1,c2);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r0
	#mul_add_c(a[6],b[2],c3,c1,c2);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[5],b[3],c3,c1,c2);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[4],b[4],c3,c1,c2);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[3],b[5],c3,c1,c2);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[2],b[6],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[1],b[7],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	$ST	r12,`8*$BNSZ`(r3)	#r[8]=c3;
	#mul_add_c(a[2],b[7],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r0
	#mul_add_c(a[3],b[6],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[4],b[5],c1,c2,c3);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[5],b[4],c1,c2,c3);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[6],b[3],c1,c2,c3);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12

	#mul_add_c(a[7],b[2],c1,c2,c3);
	$LD	r6,`7*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	$ST	r10,`9*$BNSZ`(r3)	#r[9]=c1;
	#mul_add_c(a[7],b[3],c2,c3,c1);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r0
	#mul_add_c(a[6],b[4],c2,c3,c1);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[5],b[5],c2,c3,c1);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[4],b[6],c2,c3,c1);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[3],b[7],c2,c3,c1);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`10*$BNSZ`(r3)	#r[10]=c2;
	#mul_add_c(a[4],b[7],c3,c1,c2);
	$LD	r6,`4*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r0
	#mul_add_c(a[5],b[6],c3,c1,c2);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[6],b[5],c3,c1,c2);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[7],b[4],c3,c1,c2);
	$LD	r6,`7*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	$ST	r12,`11*$BNSZ`(r3)	#r[11]=c3;
	#mul_add_c(a[7],b[5],c1,c2,c3);
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r0
	#mul_add_c(a[6],b[6],c1,c2,c3);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[5],b[7],c1,c2,c3);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	$ST	r10,`12*$BNSZ`(r3)	#r[12]=c1;
	#mul_add_c(a[6],b[7],c2,c3,c1);
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r0
	#mul_add_c(a[7],b[6],c2,c3,c1);
	$LD	r6,`7*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`13*$BNSZ`(r3)	#r[13]=c2;
	#mul_add_c(a[7],b[7],c3,c1,c2);
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9	# top column cannot carry further: no addze needed
	$ST	r12,`14*$BNSZ`(r3)	#r[14]=c3;
	$ST	r10,`15*$BNSZ`(r3)	#r[15]=c1;
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0

#
#	NOTE:	The following label name should be changed to
#		"bn_sub_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#
#
.align	4
.bn_sub_words:
#
# Handcoded version of bn_sub_words
#
#BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
#
#	r3 = r
#	r4 = a
#	r5 = b
#	r6 = n
#
#	Note:	No loop unrolling done since this is not a performance
#		critical loop.

	xor	r0,r0,r0	#set r0 = 0
#
#	check for r6 = 0 AND set carry bit.
#
	subfc.	r7,r0,r6	# If r6 is 0 then result is 0.
				# if r6 > 0 then result !=0
				# In either case carry bit is set.
	beq	Lppcasm_sub_adios	# n == 0: fall through to return (carry set -> borrow 0)
	addi	r4,r4,-$BNSZ		# pre-decrement pointers; loop uses load/store-with-update
	addi	r3,r3,-$BNSZ
	addi	r5,r5,-$BNSZ
	mtctr	r6			# CTR = n
Lppcasm_sub_mainloop:
	$LDU	r7,$BNSZ(r4)
	$LDU	r8,$BNSZ(r5)
	subfe	r6,r8,r7	# r6 = r7 + carry bit + one's complement of r8
				# if carry = 1 this is r7-r8. Else it
				# is r7-r8 -1 as we need.
	$STU	r6,$BNSZ(r3)
	bdnz-	Lppcasm_sub_mainloop
Lppcasm_sub_adios:
	subfze	r3,r0		# if carry bit is set then r3 = 0 else -1
	andi.	r3,r3,1		# keep only last bit (the borrow-out).
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
	.long	0

#
#	NOTE:	The following label name should be changed to
#		"bn_add_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_add_words:
#
# Handcoded version of bn_add_words
#
#BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
#
#	r3 = r
#	r4 = a
#	r5 = b
#	r6 = n
#
#	Note:	No loop unrolling done since this is not a performance
#		critical loop.

	xor	r0,r0,r0
#
#	check for r6 = 0. Is this needed?
#
	addic.	r6,r6,0		#test r6 and clear carry bit.
	beq	Lppcasm_add_adios	# n == 0: return 0 (carry cleared above)
	addi	r4,r4,-$BNSZ		# pre-decrement pointers for load/store-with-update
	addi	r3,r3,-$BNSZ
	addi	r5,r5,-$BNSZ
	mtctr	r6			# CTR = n
Lppcasm_add_mainloop:
	$LDU	r7,$BNSZ(r4)
	$LDU	r8,$BNSZ(r5)
	adde	r8,r7,r8		# word add with carry-in from previous iteration
	$STU	r8,$BNSZ(r3)
	bdnz-	Lppcasm_add_mainloop
Lppcasm_add_adios:
	addze	r3,r0		#return carry bit.
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
	.long	0

#
#	NOTE:	The following label name should be changed to
#		"bn_div_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_div_words:
#
#	This is a cleaned up version of code generated by
#	the AIX compiler. The only optimization is to use
#	the PPC instruction to count leading zeros instead
#	of call to num_bits_word. Since this was compiled
#	only at level -O2 we can possibly squeeze it more?
#
#	r3 = h
#	r4 = l
#	r5 = d

	$UCMPI	0,r5,0			# compare r5 and 0
	bne	Lppcasm_div1		# proceed if d!=0
	li	r3,-1			# d=0 return -1
	blr
Lppcasm_div1:
	xor	r0,r0,r0		#r0=0
	li	r8,$BITS
	$CNTLZ.	r7,r5			#r7 = num leading 0s in d.
	beq	Lppcasm_div2		#proceed if no leading zeros
	subf	r8,r7,r8		#r8 = BN_num_bits_word(d)
	$SHR.	r9,r3,r8		#are there any bits above r8'th?
	$TR	16,r9,r0		#if there are, signal to dump core...
Lppcasm_div2:
	$UCMP	0,r3,r5			#h>=d?
	blt	Lppcasm_div3		#goto Lppcasm_div3 if not
	subf	r3,r5,r3		#h-=d ;
Lppcasm_div3:				#r7 = BN_BITS2-i. so r7=i
	cmpi	0,0,r7,0		# is (i == 0)?
	beq	Lppcasm_div4
	$SHL	r3,r3,r7		# h = (h<< i)
	$SHR	r8,r4,r8		# r8 = (l >> BN_BITS2 -i)
	$SHL	r5,r5,r7		# d<<=i
	or	r3,r3,r8		# h = (h<<i)|(l>>(BN_BITS2-i))
	$SHL	r4,r4,r7		# l <<=i
Lppcasm_div4:
	$SHRI	r9,r5,`$BITS/2`		# r9 = dh
					# dl will be computed when needed
					# as it saves registers.
	li	r6,2			#r6=2
	mtctr	r6			#counter will be in count: two half-word quotient digits.
Lppcasm_divouterloop:
	$SHRI	r8,r3,`$BITS/2`		#r8 = (h>>BN_BITS4)
	$SHRI	r11,r4,`$BITS/2`	#r11= (l&BN_MASK2h)>>BN_BITS4
					# compute here for innerloop.
	$UCMP	0,r8,r9			# is (h>>BN_BITS4)==dh
	bne	Lppcasm_div5		# goto Lppcasm_div5 if not

	li	r8,-1
	$CLRU	r8,r8,`$BITS/2`		#q = BN_MASK2l (saturate the digit estimate)
	b	Lppcasm_div6
Lppcasm_div5:
	$UDIV	r8,r3,r9		#q = h/dh
Lppcasm_div6:
	$UMULL	r12,r9,r8		#th = q*dh
	$CLRU	r10,r5,`$BITS/2`	#r10=dl
	$UMULL	r6,r8,r10		#tl = q*dl

Lppcasm_divinnerloop:			# correct the digit estimate q downward
	subf	r10,r12,r3		#t = h -th
	$SHRI	r7,r10,`$BITS/2`	#r7= (t &BN_MASK2H), sort of...
	addic.	r7,r7,0			#test if r7 == 0. used below.
					# now want to compute
					# r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4)
					# the following 2 instructions do that
	$SHLI	r7,r10,`$BITS/2`	# r7 = (t<<BN_BITS4)
	or	r7,r7,r11		# r7|=((l&BN_MASK2h)>>BN_BITS4)
	$UCMP	cr1,r6,r7		# compare (tl <= r7)
	bne	Lppcasm_divinnerexit	# high part of t nonzero (cr0 from addic.) -> q is fine
	ble	cr1,Lppcasm_divinnerexit
	addi	r8,r8,-1		#q--
	subf	r12,r9,r12		#th -=dh
	$CLRU	r10,r5,`$BITS/2`	#r10=dl. t is no longer needed in loop.
	subf	r6,r10,r6		#tl -=dl
	b	Lppcasm_divinnerloop
Lppcasm_divinnerexit:
	$SHRI	r10,r6,`$BITS/2`	#t=(tl>>BN_BITS4)
	$SHLI	r11,r6,`$BITS/2`	#tl=(tl<<BN_BITS4)&BN_MASK2h;
	$UCMP	cr1,r4,r11		# compare l and tl
	add	r12,r12,r10		# th+=t
	bge	cr1,Lppcasm_div7	# if (l>=tl) goto Lppcasm_div7
	addi	r12,r12,1		# th++
Lppcasm_div7:
	subf	r11,r11,r4		#r11=l-tl
	$UCMP	cr1,r3,r12		#compare h and th
	bge	cr1,Lppcasm_div8	#if (h>=th) goto Lppcasm_div8
	addi	r8,r8,-1		# q--
	add	r3,r5,r3		# h+=d
Lppcasm_div8:
	subf	r12,r12,r3		#r12 = h-th
	$SHLI	r4,r11,`$BITS/2`	#l=(l&BN_MASK2l)<<BN_BITS4
					# want to compute
					# h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2
					# the following 2 instructions will do this.
	$INSR	r11,r12,`$BITS/2`,`$BITS/2`	# r11 is the value we want rotated $BITS/2.
	$ROTL	r3,r11,`$BITS/2`	# rotate by $BITS/2 and store in r3
	bdz	Lppcasm_div9		#if (count==0) break ;
	$SHLI	r0,r8,`$BITS/2`		#ret =q<<BN_BITS4 (save high quotient digit)
	b	Lppcasm_divouterloop
Lppcasm_div9:
	or	r3,r8,r0		# combine the two quotient half-digits
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0

#
#	NOTE:	The following label name should be changed to
#		"bn_sqr_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#
.align	4
.bn_sqr_words:
#
#	Optimized version of bn_sqr_words
#
#	void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
#
#	r3 = r
#	r4 = a
#	r5 = n
#
#	r6 = a[i].
#	r7,r8 = product.
#
#	No unrolling done here. Not performance critical.

	addic.	r5,r5,0			#test r5.
	beq	Lppcasm_sqr_adios	# n == 0: nothing to do
	addi	r4,r4,-$BNSZ		# pre-decrement pointers for load/store-with-update
	addi	r3,r3,-$BNSZ
	mtctr	r5			# CTR = n
Lppcasm_sqr_mainloop:
	#sqr(r[0],r[1],a[0]);
	$LDU	r6,$BNSZ(r4)
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	$STU	r7,$BNSZ(r3)		# two result words per input word
	$STU	r8,$BNSZ(r3)
	bdnz-	Lppcasm_sqr_mainloop
Lppcasm_sqr_adios:
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_words" i.e. remove the first dot
#		for the gcc compiler.
#		This should be automatically
#		done in the build
#

.align	4
.bn_mul_words:
#
#	BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
#
#	Computes rp[i] = ap[i]*w for i in 0..num-1, propagating the carry
#	between words; returns the final carry in r3.
#
#	r3 = rp
#	r4 = ap
#	r5 = num
#	r6 = w
	xor	r0,r0,r0
	xor	r12,r12,r12		# used for carry
	rlwinm.	r7,r5,30,2,31		# num >> 2
	beq	Lppcasm_mw_REM		# fewer than 4 words: skip unrolled loop
	mtctr	r7
Lppcasm_mw_LOOP:
	#mul(rp[0],ap[0],w,c1);
	$LD	r8,`0*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12
	#addze	r10,r10			#carry is NOT ignored.
					#will be taken care of
					#in second spin below
					#using adde.
	$ST	r9,`0*$BNSZ`(r3)
	#mul(rp[1],ap[1],w,c1);
	$LD	r8,`1*$BNSZ`(r4)
	$UMULL	r11,r6,r8
	$UMULH	r12,r6,r8
	adde	r11,r11,r10
	#addze	r12,r12
	$ST	r11,`1*$BNSZ`(r3)
	#mul(rp[2],ap[2],w,c1);
	$LD	r8,`2*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	adde	r9,r9,r12
	#addze	r10,r10
	$ST	r9,`2*$BNSZ`(r3)
	#mul(rp[3],ap[3],w,c1);
	$LD	r8,`3*$BNSZ`(r4)
	$UMULL	r11,r6,r8
	$UMULH	r12,r6,r8
	adde	r11,r11,r10
	addze	r12,r12			#this spin we collect carry into
					#r12
	$ST	r11,`3*$BNSZ`(r3)

	addi	r3,r3,`4*$BNSZ`
	addi	r4,r4,`4*$BNSZ`
	bdnz-	Lppcasm_mw_LOOP

Lppcasm_mw_REM:
	andi.	r5,r5,0x3		# remaining 0..3 words
	beq	Lppcasm_mw_OVER
	#mul(rp[0],ap[0],w,c1);
	$LD	r8,`0*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12
	addze	r10,r10
	$ST	r9,`0*$BNSZ`(r3)
	addi	r12,r10,0		# carry into r12

	addi	r5,r5,-1
	cmpli	0,0,r5,0
	beq	Lppcasm_mw_OVER


	#mul(rp[1],ap[1],w,c1);
	$LD	r8,`1*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12
	addze	r10,r10
	$ST	r9,`1*$BNSZ`(r3)
	addi	r12,r10,0

	addi	r5,r5,-1
	cmpli	0,0,r5,0
	beq	Lppcasm_mw_OVER

	#mul(rp[2],ap[2],w,c1);
	$LD	r8,`2*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12
	addze	r10,r10
	$ST	r9,`2*$BNSZ`(r3)
	addi	r12,r10,0

Lppcasm_mw_OVER:
	addi	r3,r12,0		# return final carry
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
	.long	0

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_add_words" i.e. remove the first dot
#		for the gcc compiler.
#		This should be automatically
#		done in the build
#

.align	4
.bn_mul_add_words:
#
#	BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
#
#	Computes rp[i] += ap[i]*w for i in 0..num-1, propagating the carry
#	between words; returns the final carry in r3.
#
#	r3 = rp
#	r4 = ap
#	r5 = num
#	r6 = w
#
#	empirical evidence suggests that unrolled version performs best!!
#
	xor	r0,r0,r0		#r0 = 0
	xor	r12,r12,r12		#r12 = 0 . used for carry
	rlwinm.	r7,r5,30,2,31		# num >> 2
	beq	Lppcasm_maw_leftover	# if (num < 4) go Lppcasm_maw_leftover
	mtctr	r7
Lppcasm_maw_mainloop:
	#mul_add(rp[0],ap[0],w,c1);
	$LD	r8,`0*$BNSZ`(r4)
	$LD	r11,`0*$BNSZ`(r3)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12		#r12 is carry.
	addze	r10,r10
	addc	r9,r9,r11
	#addze	r10,r10
					#the above instruction addze
					#is NOT needed. Carry will NOT
					#be ignored. It's not affected
					#by multiply and will be collected
					#in the next spin
	$ST	r9,`0*$BNSZ`(r3)

	#mul_add(rp[1],ap[1],w,c1);
	$LD	r8,`1*$BNSZ`(r4)
	$LD	r9,`1*$BNSZ`(r3)
	$UMULL	r11,r6,r8
	$UMULH	r12,r6,r8
	adde	r11,r11,r10		#r10 is carry.
	addze	r12,r12
	addc	r11,r11,r9
	#addze	r12,r12
	$ST	r11,`1*$BNSZ`(r3)

	#mul_add(rp[2],ap[2],w,c1);
	$LD	r8,`2*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$LD	r11,`2*$BNSZ`(r3)
	$UMULH	r10,r6,r8
	adde	r9,r9,r12
	addze	r10,r10
	addc	r9,r9,r11
	#addze	r10,r10
	$ST	r9,`2*$BNSZ`(r3)

	#mul_add(rp[3],ap[3],w,c1);
	$LD	r8,`3*$BNSZ`(r4)
	$UMULL	r11,r6,r8
	$LD	r9,`3*$BNSZ`(r3)
	$UMULH	r12,r6,r8
	adde	r11,r11,r10
	addze	r12,r12
	addc	r11,r11,r9
	addze	r12,r12
	$ST	r11,`3*$BNSZ`(r3)
	addi	r3,r3,`4*$BNSZ`
	addi	r4,r4,`4*$BNSZ`
	bdnz-	Lppcasm_maw_mainloop

Lppcasm_maw_leftover:
	andi.	r5,r5,0x3		# remaining 0..3 words
	beq	Lppcasm_maw_adios
	addi	r3,r3,-$BNSZ		#pre-bias pointers for the
	addi	r4,r4,-$BNSZ		#update-form loads below
	#mul_add(rp[0],ap[0],w,c1);
	mtctr	r5
	$LDU	r8,$BNSZ(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	$LDU	r11,$BNSZ(r3)
	addc	r9,r9,r11
	addze	r10,r10
	addc	r9,r9,r12
	addze	r12,r10
	$ST	r9,0(r3)

	bdz	Lppcasm_maw_adios
	#mul_add(rp[1],ap[1],w,c1);
	$LDU	r8,$BNSZ(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	$LDU	r11,$BNSZ(r3)
	addc	r9,r9,r11
	addze	r10,r10
	addc	r9,r9,r12
	addze	r12,r10
	$ST	r9,0(r3)

	bdz	Lppcasm_maw_adios
	#mul_add(rp[2],ap[2],w,c1);
	$LDU	r8,$BNSZ(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	$LDU	r11,$BNSZ(r3)
	addc	r9,r9,r11
	addze	r10,r10
	addc	r9,r9,r12
	addze	r12,r10
	$ST	r9,0(r3)

Lppcasm_maw_adios:
	addi	r3,r12,0		# return final carry
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
	.long	0
	.align	4
EOF
# Expand `...` expressions (e.g. `$BITS/2`, `3*$BNSZ`) via eval, then emit.
$data =~ s/\`([^\`]*)\`/eval $1/gem;
print $data;
close STDOUT;