#!/usr/bin/env perl
#
# Implemented as a Perl wrapper as we want to support several different
# architectures with single file. We pick up the target based on the
# file name we are asked to generate.
#
# It should be noted though that this perl code is nothing like
# <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
# as pre-processor to cover for platform differences in name decoration,
# linker tables, 32-/64-bit instruction sets...
#
# As you might know there are several PowerPC ABIs in use. Most notably
# Linux and AIX use different 32-bit ABIs. The good news is that these
# ABIs are similar enough to implement leaf(!) functions, which would be
# ABI neutral. And that's what you find here: ABI neutral leaf functions.
# In case you wonder what that is...
#
#       AIX performance
#
#       MEASUREMENTS WITH cc ON a 200 MHz PowerPC 604e.
#
#       The following is the performance of 32-bit compiler
#       generated code:
#
#       OpenSSL 0.9.6c 21 dec 2001
#       built on: Tue Jun 11 11:06:51 EDT 2002
#       options:bn(64,32) ...
#compiler: cc -DTHREADS  -DAIX  -DB_ENDIAN -DBN_LLONG -O3
#                  sign    verify    sign/s verify/s
#rsa  512 bits   0.0098s   0.0009s    102.0   1170.6
#rsa 1024 bits   0.0507s   0.0026s     19.7    387.5
#rsa 2048 bits   0.3036s   0.0085s      3.3    117.1
#rsa 4096 bits   2.0040s   0.0299s      0.5     33.4
#dsa  512 bits   0.0087s   0.0106s    114.3     94.5
#dsa 1024 bits   0.0256s   0.0313s     39.0     32.0
#
#       Same benchmark with this assembler code:
#
#rsa  512 bits   0.0056s   0.0005s    178.6   2049.2
#rsa 1024 bits   0.0283s   0.0015s     35.3    674.1
#rsa 2048 bits   0.1744s   0.0050s      5.7    201.2
#rsa 4096 bits   1.1644s   0.0179s      0.9     55.7
#dsa  512 bits   0.0052s   0.0062s    191.6    162.0
#dsa 1024 bits   0.0149s   0.0180s     67.0     55.5
#
#       Number of operations increases by almost 75%
#
#       Here are performance numbers for 64-bit compiler
#       generated code:
#
#       OpenSSL 0.9.6g [engine] 9 Aug 2002
#       built on: Fri Apr 18 16:59:20 EDT 2003
#       options:bn(64,64) ...
# compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
#                  sign    verify    sign/s verify/s
#rsa  512 bits   0.0028s   0.0003s    357.1   3844.4
#rsa 1024 bits   0.0148s   0.0008s     67.5   1239.7
#rsa 2048 bits   0.0963s   0.0028s     10.4    353.0
#rsa 4096 bits   0.6538s   0.0102s      1.5     98.1
#dsa  512 bits   0.0026s   0.0032s    382.5    313.7
#dsa 1024 bits   0.0081s   0.0099s    122.8    100.6
#
#       Same benchmark with this assembler code:
#
#rsa  512 bits   0.0020s   0.0002s    510.4   6273.7
#rsa 1024 bits   0.0088s   0.0005s    114.1   2128.3
#rsa 2048 bits   0.0540s   0.0016s     18.5    622.5
#rsa 4096 bits   0.3700s   0.0058s      2.7    171.0
#dsa  512 bits   0.0016s   0.0020s    610.7    507.1
#dsa 1024 bits   0.0047s   0.0058s    212.5    173.2
#
#       Again, performance increases by about 75%
#
#       Mac OS X, Apple G5 1.8GHz (Note this is 32 bit code)
#       OpenSSL 0.9.7c 30 Sep 2003
#
#       Original code.
#
#rsa  512 bits   0.0011s   0.0001s    906.1  11012.5
#rsa 1024 bits   0.0060s   0.0003s    166.6   3363.1
#rsa 2048 bits   0.0370s   0.0010s     27.1    982.4
#rsa 4096 bits   0.2426s   0.0036s      4.1    280.4
#dsa  512 bits   0.0010s   0.0012s   1038.1    841.5
#dsa 1024 bits   0.0030s   0.0037s    329.6    269.7
#dsa 2048 bits   0.0101s   0.0127s     98.9     78.6
#
#       Same benchmark with this assembler code:
#
#rsa  512 bits   0.0007s   0.0001s   1416.2  16645.9
#rsa 1024 bits   0.0036s   0.0002s    274.4   5380.6
#rsa 2048 bits   0.0222s   0.0006s     45.1   1589.5
#rsa 4096 bits   0.1469s   0.0022s      6.8    449.6
#dsa  512 bits   0.0006s   0.0007s   1664.2   1376.2
#dsa 1024 bits   0.0018s   0.0023s    545.0    442.2
#dsa 2048 bits   0.0061s   0.0075s    163.5    132.8
#
#       Performance increase of ~60%
#
#       If you have comments or suggestions to improve code send
#       me a note at schari@us.ibm.com
#

# The requested output file name encodes the target word size:
# a name matching /32\.s/ selects 32-bit mnemonics, /64\.s/ 64-bit ones.
# The chosen mnemonics are held in package globals that the data()
# heredoc interpolates when the assembler source is generated.
$opf = shift;

if ($opf =~ /32\.s/) {
    $BITS=  32;
    $BNSZ=  $BITS/8;            # bytes per BN_ULONG
    $ISA=   "\"ppc\"";          # argument for the .machine directive

    $LD=    "lwz";      # load
    $LDU=   "lwzu";     # load and update
    $ST=    "stw";      # store
    $STU=   "stwu";     # store and update
    $UMULL= "mullw";    # unsigned multiply low
    $UMULH= "mulhwu";   # unsigned multiply high
    $UDIV=  "divwu";    # unsigned divide
    $UCMPI= "cmplwi";   # unsigned compare with immediate
    $UCMP=  "cmplw";    # unsigned compare
    $CNTLZ= "cntlzw";   # count leading zeros
    $SHL=   "slw";      # shift left
    $SHR=   "srw";      # unsigned shift right
    $SHRI=  "srwi";     # unsigned shift right by immediate
    $SHLI=  "slwi";     # shift left by immediate
    $CLRU=  "clrlwi";   # clear upper bits
    $INSR=  "insrwi";   # insert right
    $ROTL=  "rotlwi";   # rotate left by immediate
    $TR=    "tw";       # conditional trap
} elsif ($opf =~ /64\.s/) {
    $BITS=  64;
    $BNSZ=  $BITS/8;
    $ISA=   "\"ppc64\"";

    # same as above, but 64-bit mnemonics...
    $LD=    "ld";       # load
    $LDU=   "ldu";      # load and update
    $ST=    "std";      # store
    $STU=   "stdu";     # store and update
    $UMULL= "mulld";    # unsigned multiply low
    $UMULH= "mulhdu";   # unsigned multiply high
    $UDIV=  "divdu";    # unsigned divide
    $UCMPI= "cmpldi";   # unsigned compare with immediate
    $UCMP=  "cmpld";    # unsigned compare
    $CNTLZ= "cntlzd";   # count leading zeros
    $SHL=   "sld";      # shift left
    $SHR=   "srd";      # unsigned shift right
    $SHRI=  "srdi";     # unsigned shift right by immediate
    $SHLI=  "sldi";     # shift left by immediate
    $CLRU=  "clrldi";   # clear upper bits
    $INSR=  "insrdi";   # insert right
    $ROTL=  "rotldi";   # rotate left by immediate
    $TR=    "td";       # conditional trap
} else { die "nonsense $opf"; }

# Redirect STDOUT to the requested file unless a second command-line
# argument asks us to leave STDOUT alone.  Three-argument open so the
# file name can never be misread as containing mode characters.
( defined shift || open(STDOUT, '>', $opf) ) || die "can't open $opf: $!";

# function entry points from the AIX code
#
# There are other, more elegant, ways to handle this. We (IBM) chose
# this approach as it plays well with scripts we run to 'namespace'
# OpenSSL, i.e. we add a prefix to all the public symbols so we can
# co-exist in the same process with other implementations of OpenSSL.
# 'cleverer' ways of doing these substitutions tend to hide data we
# need to be obvious.
#
my @items = ("bn_sqr_comba4",
             "bn_sqr_comba8",
             "bn_mul_comba4",
             "bn_mul_comba8",
             "bn_sub_words",
             "bn_add_words",
             "bn_div_words",
             "bn_sqr_words",
             "bn_mul_words",
             "bn_mul_add_words");

# Pick the symbol-decoration scheme from the output file name;
# BSD-style decoration is the fallback.
if ($opf =~ /linux/)    { do_linux(); }
elsif ($opf =~ /aix/)   { do_aix(); }
elsif ($opf =~ /osx/)   { do_osx(); }
else                    { do_bsd(); }

# Linux: 32-bit only needs the AIX-style leading dot dropped from each
# entry point; 64-bit (ELFv1) additionally needs an .opd function
# descriptor emitted for every public symbol.
sub do_linux {
    my $d = data();

    if ($BITS == 64) {
        foreach my $t (@items) {
            # NOTE: the replacement text below is written verbatim into
            # the .s file; keep its layout exactly as the linker expects.
            $d =~ s/\.$t:/\
\t.section\t".opd","aw"\
\t.align\t3\
\t.globl\t$t\
$t:\
\t.quad\t.$t,.TOC.\@tocbase,0\
\t.size\t$t,24\
\t.previous\n\
\t.type\t.$t,\@function\
\t.globl\t.$t\
.$t:/g;
        }
    }
    else {
        foreach my $t (@items) {
            $d =~ s/\.$t/$t/g;
        }
    }
    # hide internal labels to avoid pollution of name table...
    $d =~ s/Lppcasm_/.Lppcasm_/gm;
    print $d;
}

sub do_aix {
    # AIX assembler is smart enough to please the linker without
    # making us do something special...
    print data();
}

# MacOSX 32 bit
sub do_osx {
    my $d = data();
    # Change the bn symbol prefix from '.' to '_'
    foreach my $t (@items) {
        $d =~ s/\.$t/_$t/g;
    }
    # Change .machine to something OS X asm will accept
    $d =~ s/\.machine.*/.text/g;
    $d =~ s/\#/;/g;     # change comment from '#' to ';'
    print $d;
}

# BSD (Untested) -- assumes OS-X-style underscore decoration without
# the other Mach-O tweaks.
sub do_bsd {
    my $d = data();
    foreach my $t (@items) {
        $d =~ s/\.$t/_$t/g;
    }
    print $d;
}

# data() returns the whole AIX-decorated assembler source as a single
# string; the do_*() subs above rewrite its symbol decoration per
# target.  NOTE(review): the heredoc continues well beyond this point.
sub data {
    local($data)=<<EOF;
#--------------------------------------------------------------------
#
#
#
#
#       File:           ppc32.s
#
#       Created by:     Suresh Chari
#                       IBM Thomas J. Watson Research Library
#                       Hawthorne, NY
#
#
#       Description:    Optimized assembly routines for OpenSSL crypto
#                       on the 32 bitPowerPC platform.
#
#
#       Version History
#
#       2. Fixed bn_add,bn_sub and bn_div_words, added comments,
#          cleaned up code. Also made a single version which can
#          be used for both the AIX and Linux compilers. See NOTE
#          below.
#                               12/05/03                Suresh Chari
#                       (with lots of help from)        Andy Polyakov
##
#       1. Initial version      10/20/02                Suresh Chari
#
#
#       The following file works for the xlc,cc
#       and gcc compilers.
#
#       NOTE:   To get the file to link correctly with the gcc compiler
#               you have to change the names of the routines and remove
#               the first .(dot) character.
This should automatically 273160814Ssimon# be done in the build process. 274160814Ssimon# 275160814Ssimon# Hand optimized assembly code for the following routines 276160814Ssimon# 277160814Ssimon# bn_sqr_comba4 278160814Ssimon# bn_sqr_comba8 279160814Ssimon# bn_mul_comba4 280160814Ssimon# bn_mul_comba8 281160814Ssimon# bn_sub_words 282160814Ssimon# bn_add_words 283160814Ssimon# bn_div_words 284160814Ssimon# bn_sqr_words 285160814Ssimon# bn_mul_words 286160814Ssimon# bn_mul_add_words 287160814Ssimon# 288160814Ssimon# NOTE: It is possible to optimize this code more for 289160814Ssimon# specific PowerPC or Power architectures. On the Northstar 290160814Ssimon# architecture the optimizations in this file do 291160814Ssimon# NOT provide much improvement. 292160814Ssimon# 293160814Ssimon# If you have comments or suggestions to improve code send 294160814Ssimon# me a note at schari\@us.ibm.com 295160814Ssimon# 296160814Ssimon#-------------------------------------------------------------------------- 297160814Ssimon# 298160814Ssimon# Defines to be used in the assembly code. 299160814Ssimon# 300160814Ssimon.set r0,0 # we use it as storage for value of 0 301160814Ssimon.set SP,1 # preserved 302160814Ssimon.set RTOC,2 # preserved 303160814Ssimon.set r3,3 # 1st argument/return value 304160814Ssimon.set r4,4 # 2nd argument/volatile register 305160814Ssimon.set r5,5 # 3rd argument/volatile register 306160814Ssimon.set r6,6 # ... 307160814Ssimon.set r7,7 308160814Ssimon.set r8,8 309160814Ssimon.set r9,9 310160814Ssimon.set r10,10 311160814Ssimon.set r11,11 312160814Ssimon.set r12,12 313160814Ssimon.set r13,13 # not used, nor any other "below" it... 
314160814Ssimon 315160814Ssimon.set BO_IF_NOT,4 316160814Ssimon.set BO_IF,12 317160814Ssimon.set BO_dCTR_NZERO,16 318160814Ssimon.set BO_dCTR_ZERO,18 319160814Ssimon.set BO_ALWAYS,20 320160814Ssimon.set CR0_LT,0; 321160814Ssimon.set CR0_GT,1; 322160814Ssimon.set CR0_EQ,2 323160814Ssimon.set CR1_FX,4; 324160814Ssimon.set CR1_FEX,5; 325160814Ssimon.set CR1_VX,6 326160814Ssimon.set LR,8 327160814Ssimon 328160814Ssimon# Declare function names to be global 329160814Ssimon# NOTE: For gcc these names MUST be changed to remove 330160814Ssimon# the first . i.e. for example change ".bn_sqr_comba4" 331160814Ssimon# to "bn_sqr_comba4". This should be automatically done 332160814Ssimon# in the build. 333160814Ssimon 334160814Ssimon .globl .bn_sqr_comba4 335160814Ssimon .globl .bn_sqr_comba8 336160814Ssimon .globl .bn_mul_comba4 337160814Ssimon .globl .bn_mul_comba8 338160814Ssimon .globl .bn_sub_words 339160814Ssimon .globl .bn_add_words 340160814Ssimon .globl .bn_div_words 341160814Ssimon .globl .bn_sqr_words 342160814Ssimon .globl .bn_mul_words 343160814Ssimon .globl .bn_mul_add_words 344160814Ssimon 345160814Ssimon# .text section 346160814Ssimon 347160814Ssimon .machine $ISA 348160814Ssimon 349160814Ssimon# 350160814Ssimon# NOTE: The following label name should be changed to 351160814Ssimon# "bn_sqr_comba4" i.e. remove the first dot 352160814Ssimon# for the gcc compiler. This should be automatically 353160814Ssimon# done in the build 354160814Ssimon# 355160814Ssimon 356160814Ssimon.align 4 357160814Ssimon.bn_sqr_comba4: 358160814Ssimon# 359160814Ssimon# Optimized version of bn_sqr_comba4. 360160814Ssimon# 361160814Ssimon# void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a) 362160814Ssimon# r3 contains r 363160814Ssimon# r4 contains a 364160814Ssimon# 365160814Ssimon# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows: 366160814Ssimon# 367160814Ssimon# r5,r6 are the two BN_ULONGs being multiplied. 368160814Ssimon# r7,r8 are the results of the 32x32 giving 64 bit multiply. 
369160814Ssimon# r9,r10, r11 are the equivalents of c1,c2, c3. 370160814Ssimon# Here's the assembly 371160814Ssimon# 372160814Ssimon# 373160814Ssimon xor r0,r0,r0 # set r0 = 0. Used in the addze 374160814Ssimon # instructions below 375160814Ssimon 376160814Ssimon #sqr_add_c(a,0,c1,c2,c3) 377160814Ssimon $LD r5,`0*$BNSZ`(r4) 378160814Ssimon $UMULL r9,r5,r5 379160814Ssimon $UMULH r10,r5,r5 #in first iteration. No need 380160814Ssimon #to add since c1=c2=c3=0. 381160814Ssimon # Note c3(r11) is NOT set to 0 382160814Ssimon # but will be. 383160814Ssimon 384160814Ssimon $ST r9,`0*$BNSZ`(r3) # r[0]=c1; 385160814Ssimon # sqr_add_c2(a,1,0,c2,c3,c1); 386160814Ssimon $LD r6,`1*$BNSZ`(r4) 387160814Ssimon $UMULL r7,r5,r6 388160814Ssimon $UMULH r8,r5,r6 389160814Ssimon 390160814Ssimon addc r7,r7,r7 # compute (r7,r8)=2*(r7,r8) 391160814Ssimon adde r8,r8,r8 392160814Ssimon addze r9,r0 # catch carry if any. 393160814Ssimon # r9= r0(=0) and carry 394160814Ssimon 395160814Ssimon addc r10,r7,r10 # now add to temp result. 
396160814Ssimon addze r11,r8 # r8 added to r11 which is 0 397160814Ssimon addze r9,r9 398160814Ssimon 399160814Ssimon $ST r10,`1*$BNSZ`(r3) #r[1]=c2; 400160814Ssimon #sqr_add_c(a,1,c3,c1,c2) 401160814Ssimon $UMULL r7,r6,r6 402160814Ssimon $UMULH r8,r6,r6 403160814Ssimon addc r11,r7,r11 404160814Ssimon adde r9,r8,r9 405160814Ssimon addze r10,r0 406160814Ssimon #sqr_add_c2(a,2,0,c3,c1,c2) 407160814Ssimon $LD r6,`2*$BNSZ`(r4) 408160814Ssimon $UMULL r7,r5,r6 409160814Ssimon $UMULH r8,r5,r6 410160814Ssimon 411160814Ssimon addc r7,r7,r7 412160814Ssimon adde r8,r8,r8 413160814Ssimon addze r10,r10 414160814Ssimon 415160814Ssimon addc r11,r7,r11 416160814Ssimon adde r9,r8,r9 417160814Ssimon addze r10,r10 418160814Ssimon $ST r11,`2*$BNSZ`(r3) #r[2]=c3 419160814Ssimon #sqr_add_c2(a,3,0,c1,c2,c3); 420160814Ssimon $LD r6,`3*$BNSZ`(r4) 421160814Ssimon $UMULL r7,r5,r6 422160814Ssimon $UMULH r8,r5,r6 423160814Ssimon addc r7,r7,r7 424160814Ssimon adde r8,r8,r8 425160814Ssimon addze r11,r0 426160814Ssimon 427160814Ssimon addc r9,r7,r9 428160814Ssimon adde r10,r8,r10 429160814Ssimon addze r11,r11 430160814Ssimon #sqr_add_c2(a,2,1,c1,c2,c3); 431160814Ssimon $LD r5,`1*$BNSZ`(r4) 432160814Ssimon $LD r6,`2*$BNSZ`(r4) 433160814Ssimon $UMULL r7,r5,r6 434160814Ssimon $UMULH r8,r5,r6 435160814Ssimon 436160814Ssimon addc r7,r7,r7 437160814Ssimon adde r8,r8,r8 438160814Ssimon addze r11,r11 439160814Ssimon addc r9,r7,r9 440160814Ssimon adde r10,r8,r10 441160814Ssimon addze r11,r11 442160814Ssimon $ST r9,`3*$BNSZ`(r3) #r[3]=c1 443160814Ssimon #sqr_add_c(a,2,c2,c3,c1); 444160814Ssimon $UMULL r7,r6,r6 445160814Ssimon $UMULH r8,r6,r6 446160814Ssimon addc r10,r7,r10 447160814Ssimon adde r11,r8,r11 448160814Ssimon addze r9,r0 449160814Ssimon #sqr_add_c2(a,3,1,c2,c3,c1); 450160814Ssimon $LD r6,`3*$BNSZ`(r4) 451160814Ssimon $UMULL r7,r5,r6 452160814Ssimon $UMULH r8,r5,r6 453160814Ssimon addc r7,r7,r7 454160814Ssimon adde r8,r8,r8 455160814Ssimon addze r9,r9 456160814Ssimon 457160814Ssimon addc 
r10,r7,r10 458160814Ssimon adde r11,r8,r11 459160814Ssimon addze r9,r9 460160814Ssimon $ST r10,`4*$BNSZ`(r3) #r[4]=c2 461160814Ssimon #sqr_add_c2(a,3,2,c3,c1,c2); 462160814Ssimon $LD r5,`2*$BNSZ`(r4) 463160814Ssimon $UMULL r7,r5,r6 464160814Ssimon $UMULH r8,r5,r6 465160814Ssimon addc r7,r7,r7 466160814Ssimon adde r8,r8,r8 467160814Ssimon addze r10,r0 468160814Ssimon 469160814Ssimon addc r11,r7,r11 470160814Ssimon adde r9,r8,r9 471160814Ssimon addze r10,r10 472160814Ssimon $ST r11,`5*$BNSZ`(r3) #r[5] = c3 473160814Ssimon #sqr_add_c(a,3,c1,c2,c3); 474160814Ssimon $UMULL r7,r6,r6 475160814Ssimon $UMULH r8,r6,r6 476160814Ssimon addc r9,r7,r9 477160814Ssimon adde r10,r8,r10 478160814Ssimon 479160814Ssimon $ST r9,`6*$BNSZ`(r3) #r[6]=c1 480160814Ssimon $ST r10,`7*$BNSZ`(r3) #r[7]=c2 481160814Ssimon bclr BO_ALWAYS,CR0_LT 482160814Ssimon .long 0x00000000 483160814Ssimon 484160814Ssimon# 485160814Ssimon# NOTE: The following label name should be changed to 486160814Ssimon# "bn_sqr_comba8" i.e. remove the first dot 487160814Ssimon# for the gcc compiler. This should be automatically 488160814Ssimon# done in the build 489160814Ssimon# 490160814Ssimon 491160814Ssimon.align 4 492160814Ssimon.bn_sqr_comba8: 493160814Ssimon# 494160814Ssimon# This is an optimized version of the bn_sqr_comba8 routine. 495160814Ssimon# Tightly uses the adde instruction 496160814Ssimon# 497160814Ssimon# 498160814Ssimon# void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a) 499160814Ssimon# r3 contains r 500160814Ssimon# r4 contains a 501160814Ssimon# 502160814Ssimon# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows: 503160814Ssimon# 504160814Ssimon# r5,r6 are the two BN_ULONGs being multiplied. 505160814Ssimon# r7,r8 are the results of the 32x32 giving 64 bit multiply. 506160814Ssimon# r9,r10, r11 are the equivalents of c1,c2, c3. 
507160814Ssimon# 508160814Ssimon# Possible optimization of loading all 8 longs of a into registers 509160814Ssimon# doesnt provide any speedup 510160814Ssimon# 511160814Ssimon 512160814Ssimon xor r0,r0,r0 #set r0 = 0.Used in addze 513160814Ssimon #instructions below. 514160814Ssimon 515160814Ssimon #sqr_add_c(a,0,c1,c2,c3); 516160814Ssimon $LD r5,`0*$BNSZ`(r4) 517160814Ssimon $UMULL r9,r5,r5 #1st iteration: no carries. 518160814Ssimon $UMULH r10,r5,r5 519160814Ssimon $ST r9,`0*$BNSZ`(r3) # r[0]=c1; 520160814Ssimon #sqr_add_c2(a,1,0,c2,c3,c1); 521160814Ssimon $LD r6,`1*$BNSZ`(r4) 522160814Ssimon $UMULL r7,r5,r6 523160814Ssimon $UMULH r8,r5,r6 524160814Ssimon 525160814Ssimon addc r10,r7,r10 #add the two register number 526160814Ssimon adde r11,r8,r0 # (r8,r7) to the three register 527160814Ssimon addze r9,r0 # number (r9,r11,r10).NOTE:r0=0 528160814Ssimon 529160814Ssimon addc r10,r7,r10 #add the two register number 530160814Ssimon adde r11,r8,r11 # (r8,r7) to the three register 531160814Ssimon addze r9,r9 # number (r9,r11,r10). 532160814Ssimon 533160814Ssimon $ST r10,`1*$BNSZ`(r3) # r[1]=c2 534160814Ssimon 535160814Ssimon #sqr_add_c(a,1,c3,c1,c2); 536160814Ssimon $UMULL r7,r6,r6 537160814Ssimon $UMULH r8,r6,r6 538160814Ssimon addc r11,r7,r11 539160814Ssimon adde r9,r8,r9 540160814Ssimon addze r10,r0 541160814Ssimon #sqr_add_c2(a,2,0,c3,c1,c2); 542160814Ssimon $LD r6,`2*$BNSZ`(r4) 543160814Ssimon $UMULL r7,r5,r6 544160814Ssimon $UMULH r8,r5,r6 545160814Ssimon 546160814Ssimon addc r11,r7,r11 547160814Ssimon adde r9,r8,r9 548160814Ssimon addze r10,r10 549160814Ssimon 550160814Ssimon addc r11,r7,r11 551160814Ssimon adde r9,r8,r9 552160814Ssimon addze r10,r10 553160814Ssimon 554160814Ssimon $ST r11,`2*$BNSZ`(r3) #r[2]=c3 555160814Ssimon #sqr_add_c2(a,3,0,c1,c2,c3); 556160814Ssimon $LD r6,`3*$BNSZ`(r4) #r6 = a[3]. r5 is already a[0]. 
557160814Ssimon $UMULL r7,r5,r6 558160814Ssimon $UMULH r8,r5,r6 559160814Ssimon 560160814Ssimon addc r9,r7,r9 561160814Ssimon adde r10,r8,r10 562160814Ssimon addze r11,r0 563160814Ssimon 564160814Ssimon addc r9,r7,r9 565160814Ssimon adde r10,r8,r10 566160814Ssimon addze r11,r11 567160814Ssimon #sqr_add_c2(a,2,1,c1,c2,c3); 568160814Ssimon $LD r5,`1*$BNSZ`(r4) 569160814Ssimon $LD r6,`2*$BNSZ`(r4) 570160814Ssimon $UMULL r7,r5,r6 571160814Ssimon $UMULH r8,r5,r6 572160814Ssimon 573160814Ssimon addc r9,r7,r9 574160814Ssimon adde r10,r8,r10 575160814Ssimon addze r11,r11 576160814Ssimon 577160814Ssimon addc r9,r7,r9 578160814Ssimon adde r10,r8,r10 579160814Ssimon addze r11,r11 580160814Ssimon 581160814Ssimon $ST r9,`3*$BNSZ`(r3) #r[3]=c1; 582160814Ssimon #sqr_add_c(a,2,c2,c3,c1); 583160814Ssimon $UMULL r7,r6,r6 584160814Ssimon $UMULH r8,r6,r6 585160814Ssimon 586160814Ssimon addc r10,r7,r10 587160814Ssimon adde r11,r8,r11 588160814Ssimon addze r9,r0 589160814Ssimon #sqr_add_c2(a,3,1,c2,c3,c1); 590160814Ssimon $LD r6,`3*$BNSZ`(r4) 591160814Ssimon $UMULL r7,r5,r6 592160814Ssimon $UMULH r8,r5,r6 593160814Ssimon 594160814Ssimon addc r10,r7,r10 595160814Ssimon adde r11,r8,r11 596160814Ssimon addze r9,r9 597160814Ssimon 598160814Ssimon addc r10,r7,r10 599160814Ssimon adde r11,r8,r11 600160814Ssimon addze r9,r9 601160814Ssimon #sqr_add_c2(a,4,0,c2,c3,c1); 602160814Ssimon $LD r5,`0*$BNSZ`(r4) 603160814Ssimon $LD r6,`4*$BNSZ`(r4) 604160814Ssimon $UMULL r7,r5,r6 605160814Ssimon $UMULH r8,r5,r6 606160814Ssimon 607160814Ssimon addc r10,r7,r10 608160814Ssimon adde r11,r8,r11 609160814Ssimon addze r9,r9 610160814Ssimon 611160814Ssimon addc r10,r7,r10 612160814Ssimon adde r11,r8,r11 613160814Ssimon addze r9,r9 614160814Ssimon $ST r10,`4*$BNSZ`(r3) #r[4]=c2; 615160814Ssimon #sqr_add_c2(a,5,0,c3,c1,c2); 616160814Ssimon $LD r6,`5*$BNSZ`(r4) 617160814Ssimon $UMULL r7,r5,r6 618160814Ssimon $UMULH r8,r5,r6 619160814Ssimon 620160814Ssimon addc r11,r7,r11 621160814Ssimon adde r9,r8,r9 
622160814Ssimon addze r10,r0 623160814Ssimon 624160814Ssimon addc r11,r7,r11 625160814Ssimon adde r9,r8,r9 626160814Ssimon addze r10,r10 627160814Ssimon #sqr_add_c2(a,4,1,c3,c1,c2); 628160814Ssimon $LD r5,`1*$BNSZ`(r4) 629160814Ssimon $LD r6,`4*$BNSZ`(r4) 630160814Ssimon $UMULL r7,r5,r6 631160814Ssimon $UMULH r8,r5,r6 632160814Ssimon 633160814Ssimon addc r11,r7,r11 634160814Ssimon adde r9,r8,r9 635160814Ssimon addze r10,r10 636160814Ssimon 637160814Ssimon addc r11,r7,r11 638160814Ssimon adde r9,r8,r9 639160814Ssimon addze r10,r10 640160814Ssimon #sqr_add_c2(a,3,2,c3,c1,c2); 641160814Ssimon $LD r5,`2*$BNSZ`(r4) 642160814Ssimon $LD r6,`3*$BNSZ`(r4) 643160814Ssimon $UMULL r7,r5,r6 644160814Ssimon $UMULH r8,r5,r6 645160814Ssimon 646160814Ssimon addc r11,r7,r11 647160814Ssimon adde r9,r8,r9 648160814Ssimon addze r10,r10 649160814Ssimon 650160814Ssimon addc r11,r7,r11 651160814Ssimon adde r9,r8,r9 652160814Ssimon addze r10,r10 653160814Ssimon $ST r11,`5*$BNSZ`(r3) #r[5]=c3; 654160814Ssimon #sqr_add_c(a,3,c1,c2,c3); 655160814Ssimon $UMULL r7,r6,r6 656160814Ssimon $UMULH r8,r6,r6 657160814Ssimon addc r9,r7,r9 658160814Ssimon adde r10,r8,r10 659160814Ssimon addze r11,r0 660160814Ssimon #sqr_add_c2(a,4,2,c1,c2,c3); 661160814Ssimon $LD r6,`4*$BNSZ`(r4) 662160814Ssimon $UMULL r7,r5,r6 663160814Ssimon $UMULH r8,r5,r6 664160814Ssimon 665160814Ssimon addc r9,r7,r9 666160814Ssimon adde r10,r8,r10 667160814Ssimon addze r11,r11 668160814Ssimon 669160814Ssimon addc r9,r7,r9 670160814Ssimon adde r10,r8,r10 671160814Ssimon addze r11,r11 672160814Ssimon #sqr_add_c2(a,5,1,c1,c2,c3); 673160814Ssimon $LD r5,`1*$BNSZ`(r4) 674160814Ssimon $LD r6,`5*$BNSZ`(r4) 675160814Ssimon $UMULL r7,r5,r6 676160814Ssimon $UMULH r8,r5,r6 677160814Ssimon 678160814Ssimon addc r9,r7,r9 679160814Ssimon adde r10,r8,r10 680160814Ssimon addze r11,r11 681160814Ssimon 682160814Ssimon addc r9,r7,r9 683160814Ssimon adde r10,r8,r10 684160814Ssimon addze r11,r11 685160814Ssimon #sqr_add_c2(a,6,0,c1,c2,c3); 
686160814Ssimon $LD r5,`0*$BNSZ`(r4) 687160814Ssimon $LD r6,`6*$BNSZ`(r4) 688160814Ssimon $UMULL r7,r5,r6 689160814Ssimon $UMULH r8,r5,r6 690160814Ssimon addc r9,r7,r9 691160814Ssimon adde r10,r8,r10 692160814Ssimon addze r11,r11 693160814Ssimon addc r9,r7,r9 694160814Ssimon adde r10,r8,r10 695160814Ssimon addze r11,r11 696160814Ssimon $ST r9,`6*$BNSZ`(r3) #r[6]=c1; 697160814Ssimon #sqr_add_c2(a,7,0,c2,c3,c1); 698160814Ssimon $LD r6,`7*$BNSZ`(r4) 699160814Ssimon $UMULL r7,r5,r6 700160814Ssimon $UMULH r8,r5,r6 701160814Ssimon 702160814Ssimon addc r10,r7,r10 703160814Ssimon adde r11,r8,r11 704160814Ssimon addze r9,r0 705160814Ssimon addc r10,r7,r10 706160814Ssimon adde r11,r8,r11 707160814Ssimon addze r9,r9 708160814Ssimon #sqr_add_c2(a,6,1,c2,c3,c1); 709160814Ssimon $LD r5,`1*$BNSZ`(r4) 710160814Ssimon $LD r6,`6*$BNSZ`(r4) 711160814Ssimon $UMULL r7,r5,r6 712160814Ssimon $UMULH r8,r5,r6 713160814Ssimon 714160814Ssimon addc r10,r7,r10 715160814Ssimon adde r11,r8,r11 716160814Ssimon addze r9,r9 717160814Ssimon addc r10,r7,r10 718160814Ssimon adde r11,r8,r11 719160814Ssimon addze r9,r9 720160814Ssimon #sqr_add_c2(a,5,2,c2,c3,c1); 721160814Ssimon $LD r5,`2*$BNSZ`(r4) 722160814Ssimon $LD r6,`5*$BNSZ`(r4) 723160814Ssimon $UMULL r7,r5,r6 724160814Ssimon $UMULH r8,r5,r6 725160814Ssimon addc r10,r7,r10 726160814Ssimon adde r11,r8,r11 727160814Ssimon addze r9,r9 728160814Ssimon addc r10,r7,r10 729160814Ssimon adde r11,r8,r11 730160814Ssimon addze r9,r9 731160814Ssimon #sqr_add_c2(a,4,3,c2,c3,c1); 732160814Ssimon $LD r5,`3*$BNSZ`(r4) 733160814Ssimon $LD r6,`4*$BNSZ`(r4) 734160814Ssimon $UMULL r7,r5,r6 735160814Ssimon $UMULH r8,r5,r6 736160814Ssimon 737160814Ssimon addc r10,r7,r10 738160814Ssimon adde r11,r8,r11 739160814Ssimon addze r9,r9 740160814Ssimon addc r10,r7,r10 741160814Ssimon adde r11,r8,r11 742160814Ssimon addze r9,r9 743160814Ssimon $ST r10,`7*$BNSZ`(r3) #r[7]=c2; 744160814Ssimon #sqr_add_c(a,4,c3,c1,c2); 745160814Ssimon $UMULL r7,r6,r6 746160814Ssimon $UMULH 
r8,r6,r6 747160814Ssimon addc r11,r7,r11 748160814Ssimon adde r9,r8,r9 749160814Ssimon addze r10,r0 750160814Ssimon #sqr_add_c2(a,5,3,c3,c1,c2); 751160814Ssimon $LD r6,`5*$BNSZ`(r4) 752160814Ssimon $UMULL r7,r5,r6 753160814Ssimon $UMULH r8,r5,r6 754160814Ssimon addc r11,r7,r11 755160814Ssimon adde r9,r8,r9 756160814Ssimon addze r10,r10 757160814Ssimon addc r11,r7,r11 758160814Ssimon adde r9,r8,r9 759160814Ssimon addze r10,r10 760160814Ssimon #sqr_add_c2(a,6,2,c3,c1,c2); 761160814Ssimon $LD r5,`2*$BNSZ`(r4) 762160814Ssimon $LD r6,`6*$BNSZ`(r4) 763160814Ssimon $UMULL r7,r5,r6 764160814Ssimon $UMULH r8,r5,r6 765160814Ssimon addc r11,r7,r11 766160814Ssimon adde r9,r8,r9 767160814Ssimon addze r10,r10 768160814Ssimon 769160814Ssimon addc r11,r7,r11 770160814Ssimon adde r9,r8,r9 771160814Ssimon addze r10,r10 772160814Ssimon #sqr_add_c2(a,7,1,c3,c1,c2); 773160814Ssimon $LD r5,`1*$BNSZ`(r4) 774160814Ssimon $LD r6,`7*$BNSZ`(r4) 775160814Ssimon $UMULL r7,r5,r6 776160814Ssimon $UMULH r8,r5,r6 777160814Ssimon addc r11,r7,r11 778160814Ssimon adde r9,r8,r9 779160814Ssimon addze r10,r10 780160814Ssimon addc r11,r7,r11 781160814Ssimon adde r9,r8,r9 782160814Ssimon addze r10,r10 783160814Ssimon $ST r11,`8*$BNSZ`(r3) #r[8]=c3; 784160814Ssimon #sqr_add_c2(a,7,2,c1,c2,c3); 785160814Ssimon $LD r5,`2*$BNSZ`(r4) 786160814Ssimon $UMULL r7,r5,r6 787160814Ssimon $UMULH r8,r5,r6 788160814Ssimon 789160814Ssimon addc r9,r7,r9 790160814Ssimon adde r10,r8,r10 791160814Ssimon addze r11,r0 792160814Ssimon addc r9,r7,r9 793160814Ssimon adde r10,r8,r10 794160814Ssimon addze r11,r11 795160814Ssimon #sqr_add_c2(a,6,3,c1,c2,c3); 796160814Ssimon $LD r5,`3*$BNSZ`(r4) 797160814Ssimon $LD r6,`6*$BNSZ`(r4) 798160814Ssimon $UMULL r7,r5,r6 799160814Ssimon $UMULH r8,r5,r6 800160814Ssimon addc r9,r7,r9 801160814Ssimon adde r10,r8,r10 802160814Ssimon addze r11,r11 803160814Ssimon addc r9,r7,r9 804160814Ssimon adde r10,r8,r10 805160814Ssimon addze r11,r11 806160814Ssimon #sqr_add_c2(a,5,4,c1,c2,c3); 
807160814Ssimon $LD r5,`4*$BNSZ`(r4) 808160814Ssimon $LD r6,`5*$BNSZ`(r4) 809160814Ssimon $UMULL r7,r5,r6 810160814Ssimon $UMULH r8,r5,r6 811160814Ssimon addc r9,r7,r9 812160814Ssimon adde r10,r8,r10 813160814Ssimon addze r11,r11 814160814Ssimon addc r9,r7,r9 815160814Ssimon adde r10,r8,r10 816160814Ssimon addze r11,r11 817160814Ssimon $ST r9,`9*$BNSZ`(r3) #r[9]=c1; 818160814Ssimon #sqr_add_c(a,5,c2,c3,c1); 819160814Ssimon $UMULL r7,r6,r6 820160814Ssimon $UMULH r8,r6,r6 821160814Ssimon addc r10,r7,r10 822160814Ssimon adde r11,r8,r11 823160814Ssimon addze r9,r0 824160814Ssimon #sqr_add_c2(a,6,4,c2,c3,c1); 825160814Ssimon $LD r6,`6*$BNSZ`(r4) 826160814Ssimon $UMULL r7,r5,r6 827160814Ssimon $UMULH r8,r5,r6 828160814Ssimon addc r10,r7,r10 829160814Ssimon adde r11,r8,r11 830160814Ssimon addze r9,r9 831160814Ssimon addc r10,r7,r10 832160814Ssimon adde r11,r8,r11 833160814Ssimon addze r9,r9 834160814Ssimon #sqr_add_c2(a,7,3,c2,c3,c1); 835160814Ssimon $LD r5,`3*$BNSZ`(r4) 836160814Ssimon $LD r6,`7*$BNSZ`(r4) 837160814Ssimon $UMULL r7,r5,r6 838160814Ssimon $UMULH r8,r5,r6 839160814Ssimon addc r10,r7,r10 840160814Ssimon adde r11,r8,r11 841160814Ssimon addze r9,r9 842160814Ssimon addc r10,r7,r10 843160814Ssimon adde r11,r8,r11 844160814Ssimon addze r9,r9 845160814Ssimon $ST r10,`10*$BNSZ`(r3) #r[10]=c2; 846160814Ssimon #sqr_add_c2(a,7,4,c3,c1,c2); 847160814Ssimon $LD r5,`4*$BNSZ`(r4) 848160814Ssimon $UMULL r7,r5,r6 849160814Ssimon $UMULH r8,r5,r6 850160814Ssimon addc r11,r7,r11 851160814Ssimon adde r9,r8,r9 852160814Ssimon addze r10,r0 853160814Ssimon addc r11,r7,r11 854160814Ssimon adde r9,r8,r9 855160814Ssimon addze r10,r10 856160814Ssimon #sqr_add_c2(a,6,5,c3,c1,c2); 857160814Ssimon $LD r5,`5*$BNSZ`(r4) 858160814Ssimon $LD r6,`6*$BNSZ`(r4) 859160814Ssimon $UMULL r7,r5,r6 860160814Ssimon $UMULH r8,r5,r6 861160814Ssimon addc r11,r7,r11 862160814Ssimon adde r9,r8,r9 863160814Ssimon addze r10,r10 864160814Ssimon addc r11,r7,r11 865160814Ssimon adde r9,r8,r9 866160814Ssimon 
addze r10,r10 867160814Ssimon $ST r11,`11*$BNSZ`(r3) #r[11]=c3; 868160814Ssimon #sqr_add_c(a,6,c1,c2,c3); 869160814Ssimon $UMULL r7,r6,r6 870160814Ssimon $UMULH r8,r6,r6 871160814Ssimon addc r9,r7,r9 872160814Ssimon adde r10,r8,r10 873160814Ssimon addze r11,r0 874160814Ssimon #sqr_add_c2(a,7,5,c1,c2,c3) 875160814Ssimon $LD r6,`7*$BNSZ`(r4) 876160814Ssimon $UMULL r7,r5,r6 877160814Ssimon $UMULH r8,r5,r6 878160814Ssimon addc r9,r7,r9 879160814Ssimon adde r10,r8,r10 880160814Ssimon addze r11,r11 881160814Ssimon addc r9,r7,r9 882160814Ssimon adde r10,r8,r10 883160814Ssimon addze r11,r11 884160814Ssimon $ST r9,`12*$BNSZ`(r3) #r[12]=c1; 885160814Ssimon 886160814Ssimon #sqr_add_c2(a,7,6,c2,c3,c1) 887160814Ssimon $LD r5,`6*$BNSZ`(r4) 888160814Ssimon $UMULL r7,r5,r6 889160814Ssimon $UMULH r8,r5,r6 890160814Ssimon addc r10,r7,r10 891160814Ssimon adde r11,r8,r11 892160814Ssimon addze r9,r0 893160814Ssimon addc r10,r7,r10 894160814Ssimon adde r11,r8,r11 895160814Ssimon addze r9,r9 896160814Ssimon $ST r10,`13*$BNSZ`(r3) #r[13]=c2; 897160814Ssimon #sqr_add_c(a,7,c3,c1,c2); 898160814Ssimon $UMULL r7,r6,r6 899160814Ssimon $UMULH r8,r6,r6 900160814Ssimon addc r11,r7,r11 901160814Ssimon adde r9,r8,r9 902160814Ssimon $ST r11,`14*$BNSZ`(r3) #r[14]=c3; 903160814Ssimon $ST r9, `15*$BNSZ`(r3) #r[15]=c1; 904160814Ssimon 905160814Ssimon 906160814Ssimon bclr BO_ALWAYS,CR0_LT 907160814Ssimon 908160814Ssimon .long 0x00000000 909160814Ssimon 910160814Ssimon# 911160814Ssimon# NOTE: The following label name should be changed to 912160814Ssimon# "bn_mul_comba4" i.e. remove the first dot 913160814Ssimon# for the gcc compiler. This should be automatically 914160814Ssimon# done in the build 915160814Ssimon# 916160814Ssimon 917160814Ssimon.align 4 918160814Ssimon.bn_mul_comba4: 919160814Ssimon# 920160814Ssimon# This is an optimized version of the bn_mul_comba4 routine. 
921160814Ssimon# 922160814Ssimon# void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 923160814Ssimon# r3 contains r 924160814Ssimon# r4 contains a 925160814Ssimon# r5 contains b 926160814Ssimon# r6, r7 are the 2 BN_ULONGs being multiplied. 927160814Ssimon# r8, r9 are the results of the 32x32 giving 64 multiply. 928160814Ssimon# r10, r11, r12 are the equivalents of c1, c2, and c3. 929160814Ssimon# 930160814Ssimon xor r0,r0,r0 #r0=0. Used in addze below. 931160814Ssimon #mul_add_c(a[0],b[0],c1,c2,c3); 932160814Ssimon $LD r6,`0*$BNSZ`(r4) 933160814Ssimon $LD r7,`0*$BNSZ`(r5) 934160814Ssimon $UMULL r10,r6,r7 935160814Ssimon $UMULH r11,r6,r7 936160814Ssimon $ST r10,`0*$BNSZ`(r3) #r[0]=c1 937160814Ssimon #mul_add_c(a[0],b[1],c2,c3,c1); 938160814Ssimon $LD r7,`1*$BNSZ`(r5) 939160814Ssimon $UMULL r8,r6,r7 940160814Ssimon $UMULH r9,r6,r7 941160814Ssimon addc r11,r8,r11 942160814Ssimon adde r12,r9,r0 943160814Ssimon addze r10,r0 944160814Ssimon #mul_add_c(a[1],b[0],c2,c3,c1); 945160814Ssimon $LD r6, `1*$BNSZ`(r4) 946160814Ssimon $LD r7, `0*$BNSZ`(r5) 947160814Ssimon $UMULL r8,r6,r7 948160814Ssimon $UMULH r9,r6,r7 949160814Ssimon addc r11,r8,r11 950160814Ssimon adde r12,r9,r12 951160814Ssimon addze r10,r10 952160814Ssimon $ST r11,`1*$BNSZ`(r3) #r[1]=c2 953160814Ssimon #mul_add_c(a[2],b[0],c3,c1,c2); 954160814Ssimon $LD r6,`2*$BNSZ`(r4) 955160814Ssimon $UMULL r8,r6,r7 956160814Ssimon $UMULH r9,r6,r7 957160814Ssimon addc r12,r8,r12 958160814Ssimon adde r10,r9,r10 959160814Ssimon addze r11,r0 960160814Ssimon #mul_add_c(a[1],b[1],c3,c1,c2); 961160814Ssimon $LD r6,`1*$BNSZ`(r4) 962160814Ssimon $LD r7,`1*$BNSZ`(r5) 963160814Ssimon $UMULL r8,r6,r7 964160814Ssimon $UMULH r9,r6,r7 965160814Ssimon addc r12,r8,r12 966160814Ssimon adde r10,r9,r10 967160814Ssimon addze r11,r11 968160814Ssimon #mul_add_c(a[0],b[2],c3,c1,c2); 969160814Ssimon $LD r6,`0*$BNSZ`(r4) 970160814Ssimon $LD r7,`2*$BNSZ`(r5) 971160814Ssimon $UMULL r8,r6,r7 972160814Ssimon $UMULH r9,r6,r7 973160814Ssimon addc 
r12,r8,r12 974160814Ssimon adde r10,r9,r10 975160814Ssimon addze r11,r11 976160814Ssimon $ST r12,`2*$BNSZ`(r3) #r[2]=c3 977160814Ssimon #mul_add_c(a[0],b[3],c1,c2,c3); 978160814Ssimon $LD r7,`3*$BNSZ`(r5) 979160814Ssimon $UMULL r8,r6,r7 980160814Ssimon $UMULH r9,r6,r7 981160814Ssimon addc r10,r8,r10 982160814Ssimon adde r11,r9,r11 983160814Ssimon addze r12,r0 984160814Ssimon #mul_add_c(a[1],b[2],c1,c2,c3); 985160814Ssimon $LD r6,`1*$BNSZ`(r4) 986160814Ssimon $LD r7,`2*$BNSZ`(r5) 987160814Ssimon $UMULL r8,r6,r7 988160814Ssimon $UMULH r9,r6,r7 989160814Ssimon addc r10,r8,r10 990160814Ssimon adde r11,r9,r11 991160814Ssimon addze r12,r12 992160814Ssimon #mul_add_c(a[2],b[1],c1,c2,c3); 993160814Ssimon $LD r6,`2*$BNSZ`(r4) 994160814Ssimon $LD r7,`1*$BNSZ`(r5) 995160814Ssimon $UMULL r8,r6,r7 996160814Ssimon $UMULH r9,r6,r7 997160814Ssimon addc r10,r8,r10 998160814Ssimon adde r11,r9,r11 999160814Ssimon addze r12,r12 1000160814Ssimon #mul_add_c(a[3],b[0],c1,c2,c3); 1001160814Ssimon $LD r6,`3*$BNSZ`(r4) 1002160814Ssimon $LD r7,`0*$BNSZ`(r5) 1003160814Ssimon $UMULL r8,r6,r7 1004160814Ssimon $UMULH r9,r6,r7 1005160814Ssimon addc r10,r8,r10 1006160814Ssimon adde r11,r9,r11 1007160814Ssimon addze r12,r12 1008160814Ssimon $ST r10,`3*$BNSZ`(r3) #r[3]=c1 1009160814Ssimon #mul_add_c(a[3],b[1],c2,c3,c1); 1010160814Ssimon $LD r7,`1*$BNSZ`(r5) 1011160814Ssimon $UMULL r8,r6,r7 1012160814Ssimon $UMULH r9,r6,r7 1013160814Ssimon addc r11,r8,r11 1014160814Ssimon adde r12,r9,r12 1015160814Ssimon addze r10,r0 1016160814Ssimon #mul_add_c(a[2],b[2],c2,c3,c1); 1017160814Ssimon $LD r6,`2*$BNSZ`(r4) 1018160814Ssimon $LD r7,`2*$BNSZ`(r5) 1019160814Ssimon $UMULL r8,r6,r7 1020160814Ssimon $UMULH r9,r6,r7 1021160814Ssimon addc r11,r8,r11 1022160814Ssimon adde r12,r9,r12 1023160814Ssimon addze r10,r10 1024160814Ssimon #mul_add_c(a[1],b[3],c2,c3,c1); 1025160814Ssimon $LD r6,`1*$BNSZ`(r4) 1026160814Ssimon $LD r7,`3*$BNSZ`(r5) 1027160814Ssimon $UMULL r8,r6,r7 1028160814Ssimon $UMULH r9,r6,r7 
1029160814Ssimon addc r11,r8,r11 1030160814Ssimon adde r12,r9,r12 1031160814Ssimon addze r10,r10 1032160814Ssimon $ST r11,`4*$BNSZ`(r3) #r[4]=c2 1033160814Ssimon #mul_add_c(a[2],b[3],c3,c1,c2); 1034160814Ssimon $LD r6,`2*$BNSZ`(r4) 1035160814Ssimon $UMULL r8,r6,r7 1036160814Ssimon $UMULH r9,r6,r7 1037160814Ssimon addc r12,r8,r12 1038160814Ssimon adde r10,r9,r10 1039160814Ssimon addze r11,r0 1040160814Ssimon #mul_add_c(a[3],b[2],c3,c1,c2); 1041160814Ssimon $LD r6,`3*$BNSZ`(r4) 1042237998Sjkim $LD r7,`2*$BNSZ`(r5) 1043160814Ssimon $UMULL r8,r6,r7 1044160814Ssimon $UMULH r9,r6,r7 1045160814Ssimon addc r12,r8,r12 1046160814Ssimon adde r10,r9,r10 1047160814Ssimon addze r11,r11 1048160814Ssimon $ST r12,`5*$BNSZ`(r3) #r[5]=c3 1049160814Ssimon #mul_add_c(a[3],b[3],c1,c2,c3); 1050160814Ssimon $LD r7,`3*$BNSZ`(r5) 1051160814Ssimon $UMULL r8,r6,r7 1052160814Ssimon $UMULH r9,r6,r7 1053160814Ssimon addc r10,r8,r10 1054160814Ssimon adde r11,r9,r11 1055160814Ssimon 1056160814Ssimon $ST r10,`6*$BNSZ`(r3) #r[6]=c1 1057160814Ssimon $ST r11,`7*$BNSZ`(r3) #r[7]=c2 1058160814Ssimon bclr BO_ALWAYS,CR0_LT 1059160814Ssimon .long 0x00000000 1060160814Ssimon 1061160814Ssimon# 1062160814Ssimon# NOTE: The following label name should be changed to 1063160814Ssimon# "bn_mul_comba8" i.e. remove the first dot 1064160814Ssimon# for the gcc compiler. This should be automatically 1065160814Ssimon# done in the build 1066160814Ssimon# 1067160814Ssimon 1068160814Ssimon.align 4 1069160814Ssimon.bn_mul_comba8: 1070160814Ssimon# 1071160814Ssimon# Optimized version of the bn_mul_comba8 routine. 1072160814Ssimon# 1073160814Ssimon# void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 1074160814Ssimon# r3 contains r 1075160814Ssimon# r4 contains a 1076160814Ssimon# r5 contains b 1077160814Ssimon# r6, r7 are the 2 BN_ULONGs being multiplied. 1078160814Ssimon# r8, r9 are the results of the 32x32 giving 64 multiply. 1079160814Ssimon# r10, r11, r12 are the equivalents of c1, c2, and c3. 
1080160814Ssimon# 1081160814Ssimon xor r0,r0,r0 #r0=0. Used in addze below. 1082160814Ssimon 1083160814Ssimon #mul_add_c(a[0],b[0],c1,c2,c3); 1084160814Ssimon $LD r6,`0*$BNSZ`(r4) #a[0] 1085160814Ssimon $LD r7,`0*$BNSZ`(r5) #b[0] 1086160814Ssimon $UMULL r10,r6,r7 1087160814Ssimon $UMULH r11,r6,r7 1088160814Ssimon $ST r10,`0*$BNSZ`(r3) #r[0]=c1; 1089160814Ssimon #mul_add_c(a[0],b[1],c2,c3,c1); 1090160814Ssimon $LD r7,`1*$BNSZ`(r5) 1091160814Ssimon $UMULL r8,r6,r7 1092160814Ssimon $UMULH r9,r6,r7 1093160814Ssimon addc r11,r11,r8 1094160814Ssimon addze r12,r9 # since we didnt set r12 to zero before. 1095160814Ssimon addze r10,r0 1096160814Ssimon #mul_add_c(a[1],b[0],c2,c3,c1); 1097160814Ssimon $LD r6,`1*$BNSZ`(r4) 1098160814Ssimon $LD r7,`0*$BNSZ`(r5) 1099160814Ssimon $UMULL r8,r6,r7 1100160814Ssimon $UMULH r9,r6,r7 1101160814Ssimon addc r11,r11,r8 1102160814Ssimon adde r12,r12,r9 1103160814Ssimon addze r10,r10 1104160814Ssimon $ST r11,`1*$BNSZ`(r3) #r[1]=c2; 1105160814Ssimon #mul_add_c(a[2],b[0],c3,c1,c2); 1106160814Ssimon $LD r6,`2*$BNSZ`(r4) 1107160814Ssimon $UMULL r8,r6,r7 1108160814Ssimon $UMULH r9,r6,r7 1109160814Ssimon addc r12,r12,r8 1110160814Ssimon adde r10,r10,r9 1111160814Ssimon addze r11,r0 1112160814Ssimon #mul_add_c(a[1],b[1],c3,c1,c2); 1113160814Ssimon $LD r6,`1*$BNSZ`(r4) 1114160814Ssimon $LD r7,`1*$BNSZ`(r5) 1115160814Ssimon $UMULL r8,r6,r7 1116160814Ssimon $UMULH r9,r6,r7 1117160814Ssimon addc r12,r12,r8 1118160814Ssimon adde r10,r10,r9 1119160814Ssimon addze r11,r11 1120160814Ssimon #mul_add_c(a[0],b[2],c3,c1,c2); 1121160814Ssimon $LD r6,`0*$BNSZ`(r4) 1122160814Ssimon $LD r7,`2*$BNSZ`(r5) 1123160814Ssimon $UMULL r8,r6,r7 1124160814Ssimon $UMULH r9,r6,r7 1125160814Ssimon addc r12,r12,r8 1126160814Ssimon adde r10,r10,r9 1127160814Ssimon addze r11,r11 1128160814Ssimon $ST r12,`2*$BNSZ`(r3) #r[2]=c3; 1129160814Ssimon #mul_add_c(a[0],b[3],c1,c2,c3); 1130160814Ssimon $LD r7,`3*$BNSZ`(r5) 1131160814Ssimon $UMULL r8,r6,r7 1132160814Ssimon $UMULH r9,r6,r7 
1133160814Ssimon addc r10,r10,r8 1134160814Ssimon adde r11,r11,r9 1135160814Ssimon addze r12,r0 1136160814Ssimon #mul_add_c(a[1],b[2],c1,c2,c3); 1137160814Ssimon $LD r6,`1*$BNSZ`(r4) 1138160814Ssimon $LD r7,`2*$BNSZ`(r5) 1139160814Ssimon $UMULL r8,r6,r7 1140160814Ssimon $UMULH r9,r6,r7 1141160814Ssimon addc r10,r10,r8 1142160814Ssimon adde r11,r11,r9 1143160814Ssimon addze r12,r12 1144160814Ssimon 1145160814Ssimon #mul_add_c(a[2],b[1],c1,c2,c3); 1146160814Ssimon $LD r6,`2*$BNSZ`(r4) 1147160814Ssimon $LD r7,`1*$BNSZ`(r5) 1148160814Ssimon $UMULL r8,r6,r7 1149160814Ssimon $UMULH r9,r6,r7 1150160814Ssimon addc r10,r10,r8 1151160814Ssimon adde r11,r11,r9 1152160814Ssimon addze r12,r12 1153160814Ssimon #mul_add_c(a[3],b[0],c1,c2,c3); 1154160814Ssimon $LD r6,`3*$BNSZ`(r4) 1155160814Ssimon $LD r7,`0*$BNSZ`(r5) 1156160814Ssimon $UMULL r8,r6,r7 1157160814Ssimon $UMULH r9,r6,r7 1158160814Ssimon addc r10,r10,r8 1159160814Ssimon adde r11,r11,r9 1160160814Ssimon addze r12,r12 1161160814Ssimon $ST r10,`3*$BNSZ`(r3) #r[3]=c1; 1162160814Ssimon #mul_add_c(a[4],b[0],c2,c3,c1); 1163160814Ssimon $LD r6,`4*$BNSZ`(r4) 1164160814Ssimon $UMULL r8,r6,r7 1165160814Ssimon $UMULH r9,r6,r7 1166160814Ssimon addc r11,r11,r8 1167160814Ssimon adde r12,r12,r9 1168160814Ssimon addze r10,r0 1169160814Ssimon #mul_add_c(a[3],b[1],c2,c3,c1); 1170160814Ssimon $LD r6,`3*$BNSZ`(r4) 1171160814Ssimon $LD r7,`1*$BNSZ`(r5) 1172160814Ssimon $UMULL r8,r6,r7 1173160814Ssimon $UMULH r9,r6,r7 1174160814Ssimon addc r11,r11,r8 1175160814Ssimon adde r12,r12,r9 1176160814Ssimon addze r10,r10 1177160814Ssimon #mul_add_c(a[2],b[2],c2,c3,c1); 1178160814Ssimon $LD r6,`2*$BNSZ`(r4) 1179160814Ssimon $LD r7,`2*$BNSZ`(r5) 1180160814Ssimon $UMULL r8,r6,r7 1181160814Ssimon $UMULH r9,r6,r7 1182160814Ssimon addc r11,r11,r8 1183160814Ssimon adde r12,r12,r9 1184160814Ssimon addze r10,r10 1185160814Ssimon #mul_add_c(a[1],b[3],c2,c3,c1); 1186160814Ssimon $LD r6,`1*$BNSZ`(r4) 1187160814Ssimon $LD r7,`3*$BNSZ`(r5) 1188160814Ssimon $UMULL 
r8,r6,r7 1189160814Ssimon $UMULH r9,r6,r7 1190160814Ssimon addc r11,r11,r8 1191160814Ssimon adde r12,r12,r9 1192160814Ssimon addze r10,r10 1193160814Ssimon #mul_add_c(a[0],b[4],c2,c3,c1); 1194160814Ssimon $LD r6,`0*$BNSZ`(r4) 1195160814Ssimon $LD r7,`4*$BNSZ`(r5) 1196160814Ssimon $UMULL r8,r6,r7 1197160814Ssimon $UMULH r9,r6,r7 1198160814Ssimon addc r11,r11,r8 1199160814Ssimon adde r12,r12,r9 1200160814Ssimon addze r10,r10 1201160814Ssimon $ST r11,`4*$BNSZ`(r3) #r[4]=c2; 1202160814Ssimon #mul_add_c(a[0],b[5],c3,c1,c2); 1203160814Ssimon $LD r7,`5*$BNSZ`(r5) 1204160814Ssimon $UMULL r8,r6,r7 1205160814Ssimon $UMULH r9,r6,r7 1206160814Ssimon addc r12,r12,r8 1207160814Ssimon adde r10,r10,r9 1208160814Ssimon addze r11,r0 1209160814Ssimon #mul_add_c(a[1],b[4],c3,c1,c2); 1210160814Ssimon $LD r6,`1*$BNSZ`(r4) 1211160814Ssimon $LD r7,`4*$BNSZ`(r5) 1212160814Ssimon $UMULL r8,r6,r7 1213160814Ssimon $UMULH r9,r6,r7 1214160814Ssimon addc r12,r12,r8 1215160814Ssimon adde r10,r10,r9 1216160814Ssimon addze r11,r11 1217160814Ssimon #mul_add_c(a[2],b[3],c3,c1,c2); 1218160814Ssimon $LD r6,`2*$BNSZ`(r4) 1219160814Ssimon $LD r7,`3*$BNSZ`(r5) 1220160814Ssimon $UMULL r8,r6,r7 1221160814Ssimon $UMULH r9,r6,r7 1222160814Ssimon addc r12,r12,r8 1223160814Ssimon adde r10,r10,r9 1224160814Ssimon addze r11,r11 1225160814Ssimon #mul_add_c(a[3],b[2],c3,c1,c2); 1226160814Ssimon $LD r6,`3*$BNSZ`(r4) 1227160814Ssimon $LD r7,`2*$BNSZ`(r5) 1228160814Ssimon $UMULL r8,r6,r7 1229160814Ssimon $UMULH r9,r6,r7 1230160814Ssimon addc r12,r12,r8 1231160814Ssimon adde r10,r10,r9 1232160814Ssimon addze r11,r11 1233160814Ssimon #mul_add_c(a[4],b[1],c3,c1,c2); 1234160814Ssimon $LD r6,`4*$BNSZ`(r4) 1235160814Ssimon $LD r7,`1*$BNSZ`(r5) 1236160814Ssimon $UMULL r8,r6,r7 1237160814Ssimon $UMULH r9,r6,r7 1238160814Ssimon addc r12,r12,r8 1239160814Ssimon adde r10,r10,r9 1240160814Ssimon addze r11,r11 1241160814Ssimon #mul_add_c(a[5],b[0],c3,c1,c2); 1242160814Ssimon $LD r6,`5*$BNSZ`(r4) 1243160814Ssimon $LD 
r7,`0*$BNSZ`(r5) 1244160814Ssimon $UMULL r8,r6,r7 1245160814Ssimon $UMULH r9,r6,r7 1246160814Ssimon addc r12,r12,r8 1247160814Ssimon adde r10,r10,r9 1248160814Ssimon addze r11,r11 1249160814Ssimon $ST r12,`5*$BNSZ`(r3) #r[5]=c3; 1250160814Ssimon #mul_add_c(a[6],b[0],c1,c2,c3); 1251160814Ssimon $LD r6,`6*$BNSZ`(r4) 1252160814Ssimon $UMULL r8,r6,r7 1253160814Ssimon $UMULH r9,r6,r7 1254160814Ssimon addc r10,r10,r8 1255160814Ssimon adde r11,r11,r9 1256160814Ssimon addze r12,r0 1257160814Ssimon #mul_add_c(a[5],b[1],c1,c2,c3); 1258160814Ssimon $LD r6,`5*$BNSZ`(r4) 1259160814Ssimon $LD r7,`1*$BNSZ`(r5) 1260160814Ssimon $UMULL r8,r6,r7 1261160814Ssimon $UMULH r9,r6,r7 1262160814Ssimon addc r10,r10,r8 1263160814Ssimon adde r11,r11,r9 1264160814Ssimon addze r12,r12 1265160814Ssimon #mul_add_c(a[4],b[2],c1,c2,c3); 1266160814Ssimon $LD r6,`4*$BNSZ`(r4) 1267160814Ssimon $LD r7,`2*$BNSZ`(r5) 1268160814Ssimon $UMULL r8,r6,r7 1269160814Ssimon $UMULH r9,r6,r7 1270160814Ssimon addc r10,r10,r8 1271160814Ssimon adde r11,r11,r9 1272160814Ssimon addze r12,r12 1273160814Ssimon #mul_add_c(a[3],b[3],c1,c2,c3); 1274160814Ssimon $LD r6,`3*$BNSZ`(r4) 1275160814Ssimon $LD r7,`3*$BNSZ`(r5) 1276160814Ssimon $UMULL r8,r6,r7 1277160814Ssimon $UMULH r9,r6,r7 1278160814Ssimon addc r10,r10,r8 1279160814Ssimon adde r11,r11,r9 1280160814Ssimon addze r12,r12 1281160814Ssimon #mul_add_c(a[2],b[4],c1,c2,c3); 1282160814Ssimon $LD r6,`2*$BNSZ`(r4) 1283160814Ssimon $LD r7,`4*$BNSZ`(r5) 1284160814Ssimon $UMULL r8,r6,r7 1285160814Ssimon $UMULH r9,r6,r7 1286160814Ssimon addc r10,r10,r8 1287160814Ssimon adde r11,r11,r9 1288160814Ssimon addze r12,r12 1289160814Ssimon #mul_add_c(a[1],b[5],c1,c2,c3); 1290160814Ssimon $LD r6,`1*$BNSZ`(r4) 1291160814Ssimon $LD r7,`5*$BNSZ`(r5) 1292160814Ssimon $UMULL r8,r6,r7 1293160814Ssimon $UMULH r9,r6,r7 1294160814Ssimon addc r10,r10,r8 1295160814Ssimon adde r11,r11,r9 1296160814Ssimon addze r12,r12 1297160814Ssimon #mul_add_c(a[0],b[6],c1,c2,c3); 1298160814Ssimon $LD 
r6,`0*$BNSZ`(r4) 1299160814Ssimon $LD r7,`6*$BNSZ`(r5) 1300160814Ssimon $UMULL r8,r6,r7 1301160814Ssimon $UMULH r9,r6,r7 1302160814Ssimon addc r10,r10,r8 1303160814Ssimon adde r11,r11,r9 1304160814Ssimon addze r12,r12 1305160814Ssimon $ST r10,`6*$BNSZ`(r3) #r[6]=c1; 1306160814Ssimon #mul_add_c(a[0],b[7],c2,c3,c1); 1307160814Ssimon $LD r7,`7*$BNSZ`(r5) 1308160814Ssimon $UMULL r8,r6,r7 1309160814Ssimon $UMULH r9,r6,r7 1310160814Ssimon addc r11,r11,r8 1311160814Ssimon adde r12,r12,r9 1312160814Ssimon addze r10,r0 1313160814Ssimon #mul_add_c(a[1],b[6],c2,c3,c1); 1314160814Ssimon $LD r6,`1*$BNSZ`(r4) 1315160814Ssimon $LD r7,`6*$BNSZ`(r5) 1316160814Ssimon $UMULL r8,r6,r7 1317160814Ssimon $UMULH r9,r6,r7 1318160814Ssimon addc r11,r11,r8 1319160814Ssimon adde r12,r12,r9 1320160814Ssimon addze r10,r10 1321160814Ssimon #mul_add_c(a[2],b[5],c2,c3,c1); 1322160814Ssimon $LD r6,`2*$BNSZ`(r4) 1323160814Ssimon $LD r7,`5*$BNSZ`(r5) 1324160814Ssimon $UMULL r8,r6,r7 1325160814Ssimon $UMULH r9,r6,r7 1326160814Ssimon addc r11,r11,r8 1327160814Ssimon adde r12,r12,r9 1328160814Ssimon addze r10,r10 1329160814Ssimon #mul_add_c(a[3],b[4],c2,c3,c1); 1330160814Ssimon $LD r6,`3*$BNSZ`(r4) 1331160814Ssimon $LD r7,`4*$BNSZ`(r5) 1332160814Ssimon $UMULL r8,r6,r7 1333160814Ssimon $UMULH r9,r6,r7 1334160814Ssimon addc r11,r11,r8 1335160814Ssimon adde r12,r12,r9 1336160814Ssimon addze r10,r10 1337160814Ssimon #mul_add_c(a[4],b[3],c2,c3,c1); 1338160814Ssimon $LD r6,`4*$BNSZ`(r4) 1339160814Ssimon $LD r7,`3*$BNSZ`(r5) 1340160814Ssimon $UMULL r8,r6,r7 1341160814Ssimon $UMULH r9,r6,r7 1342160814Ssimon addc r11,r11,r8 1343160814Ssimon adde r12,r12,r9 1344160814Ssimon addze r10,r10 1345160814Ssimon #mul_add_c(a[5],b[2],c2,c3,c1); 1346160814Ssimon $LD r6,`5*$BNSZ`(r4) 1347160814Ssimon $LD r7,`2*$BNSZ`(r5) 1348160814Ssimon $UMULL r8,r6,r7 1349160814Ssimon $UMULH r9,r6,r7 1350160814Ssimon addc r11,r11,r8 1351160814Ssimon adde r12,r12,r9 1352160814Ssimon addze r10,r10 1353160814Ssimon 
#mul_add_c(a[6],b[1],c2,c3,c1); 1354160814Ssimon $LD r6,`6*$BNSZ`(r4) 1355160814Ssimon $LD r7,`1*$BNSZ`(r5) 1356160814Ssimon $UMULL r8,r6,r7 1357160814Ssimon $UMULH r9,r6,r7 1358160814Ssimon addc r11,r11,r8 1359160814Ssimon adde r12,r12,r9 1360160814Ssimon addze r10,r10 1361160814Ssimon #mul_add_c(a[7],b[0],c2,c3,c1); 1362160814Ssimon $LD r6,`7*$BNSZ`(r4) 1363160814Ssimon $LD r7,`0*$BNSZ`(r5) 1364160814Ssimon $UMULL r8,r6,r7 1365160814Ssimon $UMULH r9,r6,r7 1366160814Ssimon addc r11,r11,r8 1367160814Ssimon adde r12,r12,r9 1368160814Ssimon addze r10,r10 1369160814Ssimon $ST r11,`7*$BNSZ`(r3) #r[7]=c2; 1370160814Ssimon #mul_add_c(a[7],b[1],c3,c1,c2); 1371160814Ssimon $LD r7,`1*$BNSZ`(r5) 1372160814Ssimon $UMULL r8,r6,r7 1373160814Ssimon $UMULH r9,r6,r7 1374160814Ssimon addc r12,r12,r8 1375160814Ssimon adde r10,r10,r9 1376160814Ssimon addze r11,r0 1377160814Ssimon #mul_add_c(a[6],b[2],c3,c1,c2); 1378160814Ssimon $LD r6,`6*$BNSZ`(r4) 1379160814Ssimon $LD r7,`2*$BNSZ`(r5) 1380160814Ssimon $UMULL r8,r6,r7 1381160814Ssimon $UMULH r9,r6,r7 1382160814Ssimon addc r12,r12,r8 1383160814Ssimon adde r10,r10,r9 1384160814Ssimon addze r11,r11 1385160814Ssimon #mul_add_c(a[5],b[3],c3,c1,c2); 1386160814Ssimon $LD r6,`5*$BNSZ`(r4) 1387160814Ssimon $LD r7,`3*$BNSZ`(r5) 1388160814Ssimon $UMULL r8,r6,r7 1389160814Ssimon $UMULH r9,r6,r7 1390160814Ssimon addc r12,r12,r8 1391160814Ssimon adde r10,r10,r9 1392160814Ssimon addze r11,r11 1393160814Ssimon #mul_add_c(a[4],b[4],c3,c1,c2); 1394160814Ssimon $LD r6,`4*$BNSZ`(r4) 1395160814Ssimon $LD r7,`4*$BNSZ`(r5) 1396160814Ssimon $UMULL r8,r6,r7 1397160814Ssimon $UMULH r9,r6,r7 1398160814Ssimon addc r12,r12,r8 1399160814Ssimon adde r10,r10,r9 1400160814Ssimon addze r11,r11 1401160814Ssimon #mul_add_c(a[3],b[5],c3,c1,c2); 1402160814Ssimon $LD r6,`3*$BNSZ`(r4) 1403160814Ssimon $LD r7,`5*$BNSZ`(r5) 1404160814Ssimon $UMULL r8,r6,r7 1405160814Ssimon $UMULH r9,r6,r7 1406160814Ssimon addc r12,r12,r8 1407160814Ssimon adde r10,r10,r9 1408160814Ssimon 
addze r11,r11 1409160814Ssimon #mul_add_c(a[2],b[6],c3,c1,c2); 1410160814Ssimon $LD r6,`2*$BNSZ`(r4) 1411160814Ssimon $LD r7,`6*$BNSZ`(r5) 1412160814Ssimon $UMULL r8,r6,r7 1413160814Ssimon $UMULH r9,r6,r7 1414160814Ssimon addc r12,r12,r8 1415160814Ssimon adde r10,r10,r9 1416160814Ssimon addze r11,r11 1417160814Ssimon #mul_add_c(a[1],b[7],c3,c1,c2); 1418160814Ssimon $LD r6,`1*$BNSZ`(r4) 1419160814Ssimon $LD r7,`7*$BNSZ`(r5) 1420160814Ssimon $UMULL r8,r6,r7 1421160814Ssimon $UMULH r9,r6,r7 1422160814Ssimon addc r12,r12,r8 1423160814Ssimon adde r10,r10,r9 1424160814Ssimon addze r11,r11 1425160814Ssimon $ST r12,`8*$BNSZ`(r3) #r[8]=c3; 1426160814Ssimon #mul_add_c(a[2],b[7],c1,c2,c3); 1427160814Ssimon $LD r6,`2*$BNSZ`(r4) 1428160814Ssimon $UMULL r8,r6,r7 1429160814Ssimon $UMULH r9,r6,r7 1430160814Ssimon addc r10,r10,r8 1431160814Ssimon adde r11,r11,r9 1432160814Ssimon addze r12,r0 1433160814Ssimon #mul_add_c(a[3],b[6],c1,c2,c3); 1434160814Ssimon $LD r6,`3*$BNSZ`(r4) 1435160814Ssimon $LD r7,`6*$BNSZ`(r5) 1436160814Ssimon $UMULL r8,r6,r7 1437160814Ssimon $UMULH r9,r6,r7 1438160814Ssimon addc r10,r10,r8 1439160814Ssimon adde r11,r11,r9 1440160814Ssimon addze r12,r12 1441160814Ssimon #mul_add_c(a[4],b[5],c1,c2,c3); 1442160814Ssimon $LD r6,`4*$BNSZ`(r4) 1443160814Ssimon $LD r7,`5*$BNSZ`(r5) 1444160814Ssimon $UMULL r8,r6,r7 1445160814Ssimon $UMULH r9,r6,r7 1446160814Ssimon addc r10,r10,r8 1447160814Ssimon adde r11,r11,r9 1448160814Ssimon addze r12,r12 1449160814Ssimon #mul_add_c(a[5],b[4],c1,c2,c3); 1450160814Ssimon $LD r6,`5*$BNSZ`(r4) 1451160814Ssimon $LD r7,`4*$BNSZ`(r5) 1452160814Ssimon $UMULL r8,r6,r7 1453160814Ssimon $UMULH r9,r6,r7 1454160814Ssimon addc r10,r10,r8 1455160814Ssimon adde r11,r11,r9 1456160814Ssimon addze r12,r12 1457160814Ssimon #mul_add_c(a[6],b[3],c1,c2,c3); 1458160814Ssimon $LD r6,`6*$BNSZ`(r4) 1459160814Ssimon $LD r7,`3*$BNSZ`(r5) 1460160814Ssimon $UMULL r8,r6,r7 1461160814Ssimon $UMULH r9,r6,r7 1462160814Ssimon addc r10,r10,r8 1463160814Ssimon adde 
r11,r11,r9 1464160814Ssimon addze r12,r12 1465160814Ssimon #mul_add_c(a[7],b[2],c1,c2,c3); 1466160814Ssimon $LD r6,`7*$BNSZ`(r4) 1467160814Ssimon $LD r7,`2*$BNSZ`(r5) 1468160814Ssimon $UMULL r8,r6,r7 1469160814Ssimon $UMULH r9,r6,r7 1470160814Ssimon addc r10,r10,r8 1471160814Ssimon adde r11,r11,r9 1472160814Ssimon addze r12,r12 1473160814Ssimon $ST r10,`9*$BNSZ`(r3) #r[9]=c1; 1474160814Ssimon #mul_add_c(a[7],b[3],c2,c3,c1); 1475160814Ssimon $LD r7,`3*$BNSZ`(r5) 1476160814Ssimon $UMULL r8,r6,r7 1477160814Ssimon $UMULH r9,r6,r7 1478160814Ssimon addc r11,r11,r8 1479160814Ssimon adde r12,r12,r9 1480160814Ssimon addze r10,r0 1481160814Ssimon #mul_add_c(a[6],b[4],c2,c3,c1); 1482160814Ssimon $LD r6,`6*$BNSZ`(r4) 1483160814Ssimon $LD r7,`4*$BNSZ`(r5) 1484160814Ssimon $UMULL r8,r6,r7 1485160814Ssimon $UMULH r9,r6,r7 1486160814Ssimon addc r11,r11,r8 1487160814Ssimon adde r12,r12,r9 1488160814Ssimon addze r10,r10 1489160814Ssimon #mul_add_c(a[5],b[5],c2,c3,c1); 1490160814Ssimon $LD r6,`5*$BNSZ`(r4) 1491160814Ssimon $LD r7,`5*$BNSZ`(r5) 1492160814Ssimon $UMULL r8,r6,r7 1493160814Ssimon $UMULH r9,r6,r7 1494160814Ssimon addc r11,r11,r8 1495160814Ssimon adde r12,r12,r9 1496160814Ssimon addze r10,r10 1497160814Ssimon #mul_add_c(a[4],b[6],c2,c3,c1); 1498160814Ssimon $LD r6,`4*$BNSZ`(r4) 1499160814Ssimon $LD r7,`6*$BNSZ`(r5) 1500160814Ssimon $UMULL r8,r6,r7 1501160814Ssimon $UMULH r9,r6,r7 1502160814Ssimon addc r11,r11,r8 1503160814Ssimon adde r12,r12,r9 1504160814Ssimon addze r10,r10 1505160814Ssimon #mul_add_c(a[3],b[7],c2,c3,c1); 1506160814Ssimon $LD r6,`3*$BNSZ`(r4) 1507160814Ssimon $LD r7,`7*$BNSZ`(r5) 1508160814Ssimon $UMULL r8,r6,r7 1509160814Ssimon $UMULH r9,r6,r7 1510160814Ssimon addc r11,r11,r8 1511160814Ssimon adde r12,r12,r9 1512160814Ssimon addze r10,r10 1513160814Ssimon $ST r11,`10*$BNSZ`(r3) #r[10]=c2; 1514160814Ssimon #mul_add_c(a[4],b[7],c3,c1,c2); 1515160814Ssimon $LD r6,`4*$BNSZ`(r4) 1516160814Ssimon $UMULL r8,r6,r7 1517160814Ssimon $UMULH r9,r6,r7 
1518160814Ssimon addc r12,r12,r8 1519160814Ssimon adde r10,r10,r9 1520160814Ssimon addze r11,r0 1521160814Ssimon #mul_add_c(a[5],b[6],c3,c1,c2); 1522160814Ssimon $LD r6,`5*$BNSZ`(r4) 1523160814Ssimon $LD r7,`6*$BNSZ`(r5) 1524160814Ssimon $UMULL r8,r6,r7 1525160814Ssimon $UMULH r9,r6,r7 1526160814Ssimon addc r12,r12,r8 1527160814Ssimon adde r10,r10,r9 1528160814Ssimon addze r11,r11 1529160814Ssimon #mul_add_c(a[6],b[5],c3,c1,c2); 1530160814Ssimon $LD r6,`6*$BNSZ`(r4) 1531160814Ssimon $LD r7,`5*$BNSZ`(r5) 1532160814Ssimon $UMULL r8,r6,r7 1533160814Ssimon $UMULH r9,r6,r7 1534160814Ssimon addc r12,r12,r8 1535160814Ssimon adde r10,r10,r9 1536160814Ssimon addze r11,r11 1537160814Ssimon #mul_add_c(a[7],b[4],c3,c1,c2); 1538160814Ssimon $LD r6,`7*$BNSZ`(r4) 1539160814Ssimon $LD r7,`4*$BNSZ`(r5) 1540160814Ssimon $UMULL r8,r6,r7 1541160814Ssimon $UMULH r9,r6,r7 1542160814Ssimon addc r12,r12,r8 1543160814Ssimon adde r10,r10,r9 1544160814Ssimon addze r11,r11 1545160814Ssimon $ST r12,`11*$BNSZ`(r3) #r[11]=c3; 1546160814Ssimon #mul_add_c(a[7],b[5],c1,c2,c3); 1547160814Ssimon $LD r7,`5*$BNSZ`(r5) 1548160814Ssimon $UMULL r8,r6,r7 1549160814Ssimon $UMULH r9,r6,r7 1550160814Ssimon addc r10,r10,r8 1551160814Ssimon adde r11,r11,r9 1552160814Ssimon addze r12,r0 1553160814Ssimon #mul_add_c(a[6],b[6],c1,c2,c3); 1554160814Ssimon $LD r6,`6*$BNSZ`(r4) 1555160814Ssimon $LD r7,`6*$BNSZ`(r5) 1556160814Ssimon $UMULL r8,r6,r7 1557160814Ssimon $UMULH r9,r6,r7 1558160814Ssimon addc r10,r10,r8 1559160814Ssimon adde r11,r11,r9 1560160814Ssimon addze r12,r12 1561160814Ssimon #mul_add_c(a[5],b[7],c1,c2,c3); 1562160814Ssimon $LD r6,`5*$BNSZ`(r4) 1563160814Ssimon $LD r7,`7*$BNSZ`(r5) 1564160814Ssimon $UMULL r8,r6,r7 1565160814Ssimon $UMULH r9,r6,r7 1566160814Ssimon addc r10,r10,r8 1567160814Ssimon adde r11,r11,r9 1568160814Ssimon addze r12,r12 1569160814Ssimon $ST r10,`12*$BNSZ`(r3) #r[12]=c1; 1570160814Ssimon #mul_add_c(a[6],b[7],c2,c3,c1); 1571160814Ssimon $LD r6,`6*$BNSZ`(r4) 1572160814Ssimon $UMULL 
r8,r6,r7 1573160814Ssimon $UMULH r9,r6,r7 1574160814Ssimon addc r11,r11,r8 1575160814Ssimon adde r12,r12,r9 1576160814Ssimon addze r10,r0 1577160814Ssimon #mul_add_c(a[7],b[6],c2,c3,c1); 1578160814Ssimon $LD r6,`7*$BNSZ`(r4) 1579160814Ssimon $LD r7,`6*$BNSZ`(r5) 1580160814Ssimon $UMULL r8,r6,r7 1581160814Ssimon $UMULH r9,r6,r7 1582160814Ssimon addc r11,r11,r8 1583160814Ssimon adde r12,r12,r9 1584160814Ssimon addze r10,r10 1585160814Ssimon $ST r11,`13*$BNSZ`(r3) #r[13]=c2; 1586160814Ssimon #mul_add_c(a[7],b[7],c3,c1,c2); 1587160814Ssimon $LD r7,`7*$BNSZ`(r5) 1588160814Ssimon $UMULL r8,r6,r7 1589160814Ssimon $UMULH r9,r6,r7 1590160814Ssimon addc r12,r12,r8 1591160814Ssimon adde r10,r10,r9 1592160814Ssimon $ST r12,`14*$BNSZ`(r3) #r[14]=c3; 1593160814Ssimon $ST r10,`15*$BNSZ`(r3) #r[15]=c1; 1594160814Ssimon bclr BO_ALWAYS,CR0_LT 1595160814Ssimon .long 0x00000000 1596160814Ssimon 1597160814Ssimon# 1598160814Ssimon# NOTE: The following label name should be changed to 1599160814Ssimon# "bn_sub_words" i.e. remove the first dot 1600160814Ssimon# for the gcc compiler. This should be automatically 1601160814Ssimon# done in the build 1602160814Ssimon# 1603160814Ssimon# 1604160814Ssimon.align 4 1605160814Ssimon.bn_sub_words: 1606160814Ssimon# 1607160814Ssimon# Handcoded version of bn_sub_words 1608160814Ssimon# 1609160814Ssimon#BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) 1610160814Ssimon# 1611160814Ssimon# r3 = r 1612160814Ssimon# r4 = a 1613160814Ssimon# r5 = b 1614160814Ssimon# r6 = n 1615160814Ssimon# 1616160814Ssimon# Note: No loop unrolling done since this is not a performance 1617160814Ssimon# critical loop. 1618160814Ssimon 1619160814Ssimon xor r0,r0,r0 #set r0 = 0 1620160814Ssimon# 1621160814Ssimon# check for r6 = 0 AND set carry bit. 1622160814Ssimon# 1623160814Ssimon subfc. r7,r0,r6 # If r6 is 0 then result is 0. 1624160814Ssimon # if r6 > 0 then result !=0 1625160814Ssimon # In either case carry bit is set. 
1626160814Ssimon bc BO_IF,CR0_EQ,Lppcasm_sub_adios 1627160814Ssimon addi r4,r4,-$BNSZ 1628160814Ssimon addi r3,r3,-$BNSZ 1629160814Ssimon addi r5,r5,-$BNSZ 1630160814Ssimon mtctr r6 1631160814SsimonLppcasm_sub_mainloop: 1632160814Ssimon $LDU r7,$BNSZ(r4) 1633160814Ssimon $LDU r8,$BNSZ(r5) 1634160814Ssimon subfe r6,r8,r7 # r6 = r7+carry bit + onescomplement(r8) 1635160814Ssimon # if carry = 1 this is r7-r8. Else it 1636160814Ssimon # is r7-r8 -1 as we need. 1637160814Ssimon $STU r6,$BNSZ(r3) 1638160814Ssimon bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_sub_mainloop 1639160814SsimonLppcasm_sub_adios: 1640160814Ssimon subfze r3,r0 # if carry bit is set then r3 = 0 else -1 1641160814Ssimon andi. r3,r3,1 # keep only last bit. 1642160814Ssimon bclr BO_ALWAYS,CR0_LT 1643160814Ssimon .long 0x00000000 1644160814Ssimon 1645160814Ssimon 1646160814Ssimon# 1647160814Ssimon# NOTE: The following label name should be changed to 1648160814Ssimon# "bn_add_words" i.e. remove the first dot 1649160814Ssimon# for the gcc compiler. This should be automatically 1650160814Ssimon# done in the build 1651160814Ssimon# 1652160814Ssimon 1653160814Ssimon.align 4 1654160814Ssimon.bn_add_words: 1655160814Ssimon# 1656160814Ssimon# Handcoded version of bn_add_words 1657160814Ssimon# 1658160814Ssimon#BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) 1659160814Ssimon# 1660160814Ssimon# r3 = r 1661160814Ssimon# r4 = a 1662160814Ssimon# r5 = b 1663160814Ssimon# r6 = n 1664160814Ssimon# 1665160814Ssimon# Note: No loop unrolling done since this is not a performance 1666160814Ssimon# critical loop. 1667160814Ssimon 1668160814Ssimon xor r0,r0,r0 1669160814Ssimon# 1670160814Ssimon# check for r6 = 0. Is this needed? 1671160814Ssimon# 1672160814Ssimon addic. r6,r6,0 #test r6 and clear carry bit. 
1673160814Ssimon bc BO_IF,CR0_EQ,Lppcasm_add_adios 1674160814Ssimon addi r4,r4,-$BNSZ 1675160814Ssimon addi r3,r3,-$BNSZ 1676160814Ssimon addi r5,r5,-$BNSZ 1677160814Ssimon mtctr r6 1678160814SsimonLppcasm_add_mainloop: 1679160814Ssimon $LDU r7,$BNSZ(r4) 1680160814Ssimon $LDU r8,$BNSZ(r5) 1681160814Ssimon adde r8,r7,r8 1682160814Ssimon $STU r8,$BNSZ(r3) 1683160814Ssimon bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_add_mainloop 1684160814SsimonLppcasm_add_adios: 1685160814Ssimon addze r3,r0 #return carry bit. 1686160814Ssimon bclr BO_ALWAYS,CR0_LT 1687160814Ssimon .long 0x00000000 1688160814Ssimon 1689160814Ssimon# 1690160814Ssimon# NOTE: The following label name should be changed to 1691160814Ssimon# "bn_div_words" i.e. remove the first dot 1692160814Ssimon# for the gcc compiler. This should be automatically 1693160814Ssimon# done in the build 1694160814Ssimon# 1695160814Ssimon 1696160814Ssimon.align 4 1697160814Ssimon.bn_div_words: 1698160814Ssimon# 1699160814Ssimon# This is a cleaned up version of code generated by 1700160814Ssimon# the AIX compiler. The only optimization is to use 1701160814Ssimon# the PPC instruction to count leading zeros instead 1702160814Ssimon# of call to num_bits_word. Since this was compiled 1703160814Ssimon# only at level -O2 we can possibly squeeze it more? 1704160814Ssimon# 1705160814Ssimon# r3 = h 1706160814Ssimon# r4 = l 1707160814Ssimon# r5 = d 1708160814Ssimon 1709160814Ssimon $UCMPI 0,r5,0 # compare r5 and 0 1710160814Ssimon bc BO_IF_NOT,CR0_EQ,Lppcasm_div1 # proceed if d!=0 1711160814Ssimon li r3,-1 # d=0 return -1 1712160814Ssimon bclr BO_ALWAYS,CR0_LT 1713160814SsimonLppcasm_div1: 1714160814Ssimon xor r0,r0,r0 #r0=0 1715160814Ssimon li r8,$BITS 1716160814Ssimon $CNTLZ. r7,r5 #r7 = num leading 0s in d. 1717160814Ssimon bc BO_IF,CR0_EQ,Lppcasm_div2 #proceed if no leading zeros 1718160814Ssimon subf r8,r7,r8 #r8 = BN_num_bits_word(d) 1719160814Ssimon $SHR. r9,r3,r8 #are there any bits above r8'th? 
1720160814Ssimon $TR 16,r9,r0 #if there're, signal to dump core... 1721160814SsimonLppcasm_div2: 1722160814Ssimon $UCMP 0,r3,r5 #h>=d? 1723160814Ssimon bc BO_IF,CR0_LT,Lppcasm_div3 #goto Lppcasm_div3 if not 1724160814Ssimon subf r3,r5,r3 #h-=d ; 1725160814SsimonLppcasm_div3: #r7 = BN_BITS2-i. so r7=i 1726160814Ssimon cmpi 0,0,r7,0 # is (i == 0)? 1727160814Ssimon bc BO_IF,CR0_EQ,Lppcasm_div4 1728160814Ssimon $SHL r3,r3,r7 # h = (h<< i) 1729160814Ssimon $SHR r8,r4,r8 # r8 = (l >> BN_BITS2 -i) 1730160814Ssimon $SHL r5,r5,r7 # d<<=i 1731160814Ssimon or r3,r3,r8 # h = (h<<i)|(l>>(BN_BITS2-i)) 1732160814Ssimon $SHL r4,r4,r7 # l <<=i 1733160814SsimonLppcasm_div4: 1734160814Ssimon $SHRI r9,r5,`$BITS/2` # r9 = dh 1735160814Ssimon # dl will be computed when needed 1736160814Ssimon # as it saves registers. 1737160814Ssimon li r6,2 #r6=2 1738160814Ssimon mtctr r6 #counter will be in count. 1739160814SsimonLppcasm_divouterloop: 1740160814Ssimon $SHRI r8,r3,`$BITS/2` #r8 = (h>>BN_BITS4) 1741160814Ssimon $SHRI r11,r4,`$BITS/2` #r11= (l&BN_MASK2h)>>BN_BITS4 1742160814Ssimon # compute here for innerloop. 1743160814Ssimon $UCMP 0,r8,r9 # is (h>>BN_BITS4)==dh 1744160814Ssimon bc BO_IF_NOT,CR0_EQ,Lppcasm_div5 # goto Lppcasm_div5 if not 1745160814Ssimon 1746160814Ssimon li r8,-1 1747160814Ssimon $CLRU r8,r8,`$BITS/2` #q = BN_MASK2l 1748160814Ssimon b Lppcasm_div6 1749160814SsimonLppcasm_div5: 1750160814Ssimon $UDIV r8,r3,r9 #q = h/dh 1751160814SsimonLppcasm_div6: 1752160814Ssimon $UMULL r12,r9,r8 #th = q*dh 1753160814Ssimon $CLRU r10,r5,`$BITS/2` #r10=dl 1754160814Ssimon $UMULL r6,r8,r10 #tl = q*dl 1755160814Ssimon 1756160814SsimonLppcasm_divinnerloop: 1757160814Ssimon subf r10,r12,r3 #t = h -th 1758160814Ssimon $SHRI r7,r10,`$BITS/2` #r7= (t &BN_MASK2H), sort of... 1759160814Ssimon addic. r7,r7,0 #test if r7 == 0. used below. 
1760160814Ssimon # now want to compute 1761160814Ssimon # r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4) 1762160814Ssimon # the following 2 instructions do that 1763160814Ssimon $SHLI r7,r10,`$BITS/2` # r7 = (t<<BN_BITS4) 1764160814Ssimon or r7,r7,r11 # r7|=((l&BN_MASK2h)>>BN_BITS4) 1765160814Ssimon $UCMP 1,r6,r7 # compare (tl <= r7) 1766160814Ssimon bc BO_IF_NOT,CR0_EQ,Lppcasm_divinnerexit 1767160814Ssimon bc BO_IF_NOT,CR1_FEX,Lppcasm_divinnerexit 1768160814Ssimon addi r8,r8,-1 #q-- 1769160814Ssimon subf r12,r9,r12 #th -=dh 1770160814Ssimon $CLRU r10,r5,`$BITS/2` #r10=dl. t is no longer needed in loop. 1771160814Ssimon subf r6,r10,r6 #tl -=dl 1772160814Ssimon b Lppcasm_divinnerloop 1773160814SsimonLppcasm_divinnerexit: 1774160814Ssimon $SHRI r10,r6,`$BITS/2` #t=(tl>>BN_BITS4) 1775160814Ssimon $SHLI r11,r6,`$BITS/2` #tl=(tl<<BN_BITS4)&BN_MASK2h; 1776160814Ssimon $UCMP 1,r4,r11 # compare l and tl 1777160814Ssimon add r12,r12,r10 # th+=t 1778160814Ssimon bc BO_IF_NOT,CR1_FX,Lppcasm_div7 # if (l>=tl) goto Lppcasm_div7 1779160814Ssimon addi r12,r12,1 # th++ 1780160814SsimonLppcasm_div7: 1781160814Ssimon subf r11,r11,r4 #r11=l-tl 1782160814Ssimon $UCMP 1,r3,r12 #compare h and th 1783160814Ssimon bc BO_IF_NOT,CR1_FX,Lppcasm_div8 #if (h>=th) goto Lppcasm_div8 1784160814Ssimon addi r8,r8,-1 # q-- 1785160814Ssimon add r3,r5,r3 # h+=d 1786160814SsimonLppcasm_div8: 1787160814Ssimon subf r12,r12,r3 #r12 = h-th 1788160814Ssimon $SHLI r4,r11,`$BITS/2` #l=(l&BN_MASK2l)<<BN_BITS4 1789160814Ssimon # want to compute 1790160814Ssimon # h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2 1791160814Ssimon # the following 2 instructions will do this. 1792160814Ssimon $INSR r11,r12,`$BITS/2`,`$BITS/2` # r11 is the value we want rotated $BITS/2. 
1793160814Ssimon $ROTL r3,r11,`$BITS/2` # rotate by $BITS/2 and store in r3 1794160814Ssimon bc BO_dCTR_ZERO,CR0_EQ,Lppcasm_div9#if (count==0) break ; 1795160814Ssimon $SHLI r0,r8,`$BITS/2` #ret =q<<BN_BITS4 1796160814Ssimon b Lppcasm_divouterloop 1797160814SsimonLppcasm_div9: 1798160814Ssimon or r3,r8,r0 1799160814Ssimon bclr BO_ALWAYS,CR0_LT 1800160814Ssimon .long 0x00000000 1801160814Ssimon 1802160814Ssimon# 1803160814Ssimon# NOTE: The following label name should be changed to 1804160814Ssimon# "bn_sqr_words" i.e. remove the first dot 1805160814Ssimon# for the gcc compiler. This should be automatically 1806160814Ssimon# done in the build 1807160814Ssimon# 1808160814Ssimon.align 4 1809160814Ssimon.bn_sqr_words: 1810160814Ssimon# 1811160814Ssimon# Optimized version of bn_sqr_words 1812160814Ssimon# 1813160814Ssimon# void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n) 1814160814Ssimon# 1815160814Ssimon# r3 = r 1816160814Ssimon# r4 = a 1817160814Ssimon# r5 = n 1818160814Ssimon# 1819160814Ssimon# r6 = a[i]. 1820160814Ssimon# r7,r8 = product. 1821160814Ssimon# 1822160814Ssimon# No unrolling done here. Not performance critical. 1823160814Ssimon 1824160814Ssimon addic. r5,r5,0 #test r5. 1825160814Ssimon bc BO_IF,CR0_EQ,Lppcasm_sqr_adios 1826160814Ssimon addi r4,r4,-$BNSZ 1827160814Ssimon addi r3,r3,-$BNSZ 1828160814Ssimon mtctr r5 1829160814SsimonLppcasm_sqr_mainloop: 1830160814Ssimon #sqr(r[0],r[1],a[0]); 1831160814Ssimon $LDU r6,$BNSZ(r4) 1832160814Ssimon $UMULL r7,r6,r6 1833160814Ssimon $UMULH r8,r6,r6 1834160814Ssimon $STU r7,$BNSZ(r3) 1835160814Ssimon $STU r8,$BNSZ(r3) 1836160814Ssimon bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_sqr_mainloop 1837160814SsimonLppcasm_sqr_adios: 1838160814Ssimon bclr BO_ALWAYS,CR0_LT 1839160814Ssimon .long 0x00000000 1840160814Ssimon 1841160814Ssimon 1842160814Ssimon# 1843160814Ssimon# NOTE: The following label name should be changed to 1844160814Ssimon# "bn_mul_words" i.e. remove the first dot 1845160814Ssimon# for the gcc compiler. 
This should be automatically 1846160814Ssimon# done in the build 1847160814Ssimon# 1848160814Ssimon 1849160814Ssimon.align 4 1850160814Ssimon.bn_mul_words: 1851160814Ssimon# 1852160814Ssimon# BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) 1853160814Ssimon# 1854160814Ssimon# r3 = rp 1855160814Ssimon# r4 = ap 1856160814Ssimon# r5 = num 1857160814Ssimon# r6 = w 1858160814Ssimon xor r0,r0,r0 1859160814Ssimon xor r12,r12,r12 # used for carry 1860160814Ssimon rlwinm. r7,r5,30,2,31 # num >> 2 1861160814Ssimon bc BO_IF,CR0_EQ,Lppcasm_mw_REM 1862160814Ssimon mtctr r7 1863160814SsimonLppcasm_mw_LOOP: 1864160814Ssimon #mul(rp[0],ap[0],w,c1); 1865160814Ssimon $LD r8,`0*$BNSZ`(r4) 1866160814Ssimon $UMULL r9,r6,r8 1867160814Ssimon $UMULH r10,r6,r8 1868160814Ssimon addc r9,r9,r12 1869160814Ssimon #addze r10,r10 #carry is NOT ignored. 1870160814Ssimon #will be taken care of 1871160814Ssimon #in second spin below 1872160814Ssimon #using adde. 1873160814Ssimon $ST r9,`0*$BNSZ`(r3) 1874160814Ssimon #mul(rp[1],ap[1],w,c1); 1875160814Ssimon $LD r8,`1*$BNSZ`(r4) 1876160814Ssimon $UMULL r11,r6,r8 1877160814Ssimon $UMULH r12,r6,r8 1878160814Ssimon adde r11,r11,r10 1879160814Ssimon #addze r12,r12 1880160814Ssimon $ST r11,`1*$BNSZ`(r3) 1881160814Ssimon #mul(rp[2],ap[2],w,c1); 1882160814Ssimon $LD r8,`2*$BNSZ`(r4) 1883160814Ssimon $UMULL r9,r6,r8 1884160814Ssimon $UMULH r10,r6,r8 1885160814Ssimon adde r9,r9,r12 1886160814Ssimon #addze r10,r10 1887160814Ssimon $ST r9,`2*$BNSZ`(r3) 1888160814Ssimon #mul_add(rp[3],ap[3],w,c1); 1889160814Ssimon $LD r8,`3*$BNSZ`(r4) 1890160814Ssimon $UMULL r11,r6,r8 1891160814Ssimon $UMULH r12,r6,r8 1892160814Ssimon adde r11,r11,r10 1893160814Ssimon addze r12,r12 #this spin we collect carry into 1894160814Ssimon #r12 1895160814Ssimon $ST r11,`3*$BNSZ`(r3) 1896160814Ssimon 1897160814Ssimon addi r3,r3,`4*$BNSZ` 1898160814Ssimon addi r4,r4,`4*$BNSZ` 1899160814Ssimon bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_mw_LOOP 1900160814Ssimon 
1901160814SsimonLppcasm_mw_REM: 1902160814Ssimon andi. r5,r5,0x3 1903160814Ssimon bc BO_IF,CR0_EQ,Lppcasm_mw_OVER 1904160814Ssimon #mul(rp[0],ap[0],w,c1); 1905160814Ssimon $LD r8,`0*$BNSZ`(r4) 1906160814Ssimon $UMULL r9,r6,r8 1907160814Ssimon $UMULH r10,r6,r8 1908160814Ssimon addc r9,r9,r12 1909160814Ssimon addze r10,r10 1910160814Ssimon $ST r9,`0*$BNSZ`(r3) 1911160814Ssimon addi r12,r10,0 1912160814Ssimon 1913160814Ssimon addi r5,r5,-1 1914160814Ssimon cmpli 0,0,r5,0 1915160814Ssimon bc BO_IF,CR0_EQ,Lppcasm_mw_OVER 1916160814Ssimon 1917160814Ssimon 1918160814Ssimon #mul(rp[1],ap[1],w,c1); 1919160814Ssimon $LD r8,`1*$BNSZ`(r4) 1920160814Ssimon $UMULL r9,r6,r8 1921160814Ssimon $UMULH r10,r6,r8 1922160814Ssimon addc r9,r9,r12 1923160814Ssimon addze r10,r10 1924160814Ssimon $ST r9,`1*$BNSZ`(r3) 1925160814Ssimon addi r12,r10,0 1926160814Ssimon 1927160814Ssimon addi r5,r5,-1 1928160814Ssimon cmpli 0,0,r5,0 1929160814Ssimon bc BO_IF,CR0_EQ,Lppcasm_mw_OVER 1930160814Ssimon 1931160814Ssimon #mul_add(rp[2],ap[2],w,c1); 1932160814Ssimon $LD r8,`2*$BNSZ`(r4) 1933160814Ssimon $UMULL r9,r6,r8 1934160814Ssimon $UMULH r10,r6,r8 1935160814Ssimon addc r9,r9,r12 1936160814Ssimon addze r10,r10 1937160814Ssimon $ST r9,`2*$BNSZ`(r3) 1938160814Ssimon addi r12,r10,0 1939160814Ssimon 1940160814SsimonLppcasm_mw_OVER: 1941160814Ssimon addi r3,r12,0 1942160814Ssimon bclr BO_ALWAYS,CR0_LT 1943160814Ssimon .long 0x00000000 1944160814Ssimon 1945160814Ssimon# 1946160814Ssimon# NOTE: The following label name should be changed to 1947160814Ssimon# "bn_mul_add_words" i.e. remove the first dot 1948160814Ssimon# for the gcc compiler. 
This should be automatically 1949160814Ssimon# done in the build 1950160814Ssimon# 1951160814Ssimon 1952160814Ssimon.align 4 1953160814Ssimon.bn_mul_add_words: 1954160814Ssimon# 1955160814Ssimon# BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) 1956160814Ssimon# 1957160814Ssimon# r3 = rp 1958160814Ssimon# r4 = ap 1959160814Ssimon# r5 = num 1960160814Ssimon# r6 = w 1961160814Ssimon# 1962160814Ssimon# empirical evidence suggests that unrolled version performs best!! 1963160814Ssimon# 1964160814Ssimon xor r0,r0,r0 #r0 = 0 1965160814Ssimon xor r12,r12,r12 #r12 = 0 . used for carry 1966160814Ssimon rlwinm. r7,r5,30,2,31 # num >> 2 1967160814Ssimon bc BO_IF,CR0_EQ,Lppcasm_maw_leftover # if (num < 4) go LPPCASM_maw_leftover 1968160814Ssimon mtctr r7 1969160814SsimonLppcasm_maw_mainloop: 1970160814Ssimon #mul_add(rp[0],ap[0],w,c1); 1971160814Ssimon $LD r8,`0*$BNSZ`(r4) 1972160814Ssimon $LD r11,`0*$BNSZ`(r3) 1973160814Ssimon $UMULL r9,r6,r8 1974160814Ssimon $UMULH r10,r6,r8 1975160814Ssimon addc r9,r9,r12 #r12 is carry. 1976160814Ssimon addze r10,r10 1977160814Ssimon addc r9,r9,r11 1978160814Ssimon #addze r10,r10 1979160814Ssimon #the above instruction addze 1980160814Ssimon #is NOT needed. Carry will NOT 1981160814Ssimon #be ignored. It's not affected 1982160814Ssimon #by multiply and will be collected 1983160814Ssimon #in the next spin 1984160814Ssimon $ST r9,`0*$BNSZ`(r3) 1985160814Ssimon 1986160814Ssimon #mul_add(rp[1],ap[1],w,c1); 1987160814Ssimon $LD r8,`1*$BNSZ`(r4) 1988160814Ssimon $LD r9,`1*$BNSZ`(r3) 1989160814Ssimon $UMULL r11,r6,r8 1990160814Ssimon $UMULH r12,r6,r8 1991160814Ssimon adde r11,r11,r10 #r10 is carry. 
1992160814Ssimon addze r12,r12 1993160814Ssimon addc r11,r11,r9 1994160814Ssimon #addze r12,r12 1995160814Ssimon $ST r11,`1*$BNSZ`(r3) 1996160814Ssimon 1997160814Ssimon #mul_add(rp[2],ap[2],w,c1); 1998160814Ssimon $LD r8,`2*$BNSZ`(r4) 1999160814Ssimon $UMULL r9,r6,r8 2000160814Ssimon $LD r11,`2*$BNSZ`(r3) 2001160814Ssimon $UMULH r10,r6,r8 2002160814Ssimon adde r9,r9,r12 2003160814Ssimon addze r10,r10 2004160814Ssimon addc r9,r9,r11 2005160814Ssimon #addze r10,r10 2006160814Ssimon $ST r9,`2*$BNSZ`(r3) 2007160814Ssimon 2008160814Ssimon #mul_add(rp[3],ap[3],w,c1); 2009160814Ssimon $LD r8,`3*$BNSZ`(r4) 2010160814Ssimon $UMULL r11,r6,r8 2011160814Ssimon $LD r9,`3*$BNSZ`(r3) 2012160814Ssimon $UMULH r12,r6,r8 2013160814Ssimon adde r11,r11,r10 2014160814Ssimon addze r12,r12 2015160814Ssimon addc r11,r11,r9 2016160814Ssimon addze r12,r12 2017160814Ssimon $ST r11,`3*$BNSZ`(r3) 2018160814Ssimon addi r3,r3,`4*$BNSZ` 2019160814Ssimon addi r4,r4,`4*$BNSZ` 2020160814Ssimon bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_maw_mainloop 2021160814Ssimon 2022160814SsimonLppcasm_maw_leftover: 2023160814Ssimon andi. 
r5,r5,0x3 2024160814Ssimon bc BO_IF,CR0_EQ,Lppcasm_maw_adios 2025160814Ssimon addi r3,r3,-$BNSZ 2026160814Ssimon addi r4,r4,-$BNSZ 2027160814Ssimon #mul_add(rp[0],ap[0],w,c1); 2028160814Ssimon mtctr r5 2029160814Ssimon $LDU r8,$BNSZ(r4) 2030160814Ssimon $UMULL r9,r6,r8 2031160814Ssimon $UMULH r10,r6,r8 2032160814Ssimon $LDU r11,$BNSZ(r3) 2033160814Ssimon addc r9,r9,r11 2034160814Ssimon addze r10,r10 2035160814Ssimon addc r9,r9,r12 2036160814Ssimon addze r12,r10 2037160814Ssimon $ST r9,0(r3) 2038160814Ssimon 2039160814Ssimon bc BO_dCTR_ZERO,CR0_EQ,Lppcasm_maw_adios 2040160814Ssimon #mul_add(rp[1],ap[1],w,c1); 2041160814Ssimon $LDU r8,$BNSZ(r4) 2042160814Ssimon $UMULL r9,r6,r8 2043160814Ssimon $UMULH r10,r6,r8 2044160814Ssimon $LDU r11,$BNSZ(r3) 2045160814Ssimon addc r9,r9,r11 2046160814Ssimon addze r10,r10 2047160814Ssimon addc r9,r9,r12 2048160814Ssimon addze r12,r10 2049160814Ssimon $ST r9,0(r3) 2050160814Ssimon 2051160814Ssimon bc BO_dCTR_ZERO,CR0_EQ,Lppcasm_maw_adios 2052160814Ssimon #mul_add(rp[2],ap[2],w,c1); 2053160814Ssimon $LDU r8,$BNSZ(r4) 2054160814Ssimon $UMULL r9,r6,r8 2055160814Ssimon $UMULH r10,r6,r8 2056160814Ssimon $LDU r11,$BNSZ(r3) 2057160814Ssimon addc r9,r9,r11 2058160814Ssimon addze r10,r10 2059160814Ssimon addc r9,r9,r12 2060160814Ssimon addze r12,r10 2061160814Ssimon $ST r9,0(r3) 2062160814Ssimon 2063160814SsimonLppcasm_maw_adios: 2064160814Ssimon addi r3,r12,0 2065160814Ssimon bclr BO_ALWAYS,CR0_LT 2066160814Ssimon .long 0x00000000 2067160814Ssimon .align 4 2068160814SsimonEOF 2069160814Ssimon $data =~ s/\`([^\`]*)\`/eval $1/gem; 2070160814Ssimon 2071160814Ssimon # if some assembler chokes on some simplified mnemonic, 2072160814Ssimon # this is the spot to fix it up, e.g.: 2073160814Ssimon # GNU as doesn't seem to accept cmplw, 32-bit unsigned compare 2074160814Ssimon $data =~ s/^(\s*)cmplw(\s+)([^,]+),(.*)/$1cmpl$2$3,0,$4/gm; 2075160814Ssimon # assembler X doesn't accept li, load immediate value 2076160814Ssimon #$data =~ 
s/^(\s*)li(\s+)([^,]+),(.*)/$1addi$2$3,0,$4/gm; 2077206046Ssimon # assembler Y chokes on apostrophes in comments 2078206046Ssimon $data =~ s/'//gm; 2079160814Ssimon return($data); 2080160814Ssimon} 2081