#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# This module doesn't present direct interest for OpenSSL, because it
# doesn't provide better performance for longer keys, at least not on
# in-order-execution cores. While 512-bit RSA sign operations can be
# 65% faster in 64-bit mode, 1024-bit ones are only 15% faster, and
# 4096-bit ones are up to 15% slower. In 32-bit mode it varies from
# 16% improvement for 512-bit RSA sign to -33% for 4096-bit RSA
# verify:-( All comparisons are against bn_mul_mont-free assembler.
# The module might be of interest to embedded system developers, as
# the code is smaller than 1KB, yet offers >3x improvement on MIPS64
# and 75-30% [less for longer keys] on MIPS32 over compiler-generated
# code.

# Usage: perl <this script> <flavour> [ignored args ...] [<output file>]
#
# The first argument selects the ABI flavour. Following arguments are
# skipped until one looks like a plain file name with an extension;
# that one becomes the output file. Without such an argument the
# generated assembler is written to the inherited STDOUT.

######################################################################
# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
# widely used. Then there is a new contender: NUBI. It appears that if
# one picks the latter, it's possible to arrange code in ABI neutral
# manner. Therefore let's stick to NUBI register layout:
#
($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
#
# The return value is placed in $a0. Following coding rules facilitate
# interoperability:
#
# - never ever touch $tp, "thread pointer", former $gp;
# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
#   old code];
# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
#
# For reference here is register layout for N32/64 MIPS ABIs:
#
# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
$flavour = shift;	# supported flavours are o32,n32,64,nubi32,nubi64

# Pointer arithmetic and register save/restore mnemonics, plus the size
# of one saved register, depend on the register width of the flavour.
if ($flavour =~ /64|n32/i) {
	$PTR_ADD="dadd";	# incidentally works even on n32
	$PTR_SUB="dsub";	# incidentally works even on n32
	$REG_S="sd";
	$REG_L="ld";
	$SZREG=8;
} else {
	$PTR_ADD="add";
	$PTR_SUB="sub";
	$REG_S="sw";
	$REG_L="lw";
	$SZREG=4;
}
# Value OR-ed into the .mask directive below; the nubi flavours also
# save $s0-$s3 (see the conditional save/restore sequences).
$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0x00fff000 : 0x00ff0000;
#
# <appro@openssl.org>
#
######################################################################

# Consume arguments until one looks like an output file name
# (word characters and dashes, a dot, then an extension) or the
# argument list is exhausted.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT only when an output file was actually named, and fail
# loudly if it cannot be created instead of silently carrying on.
if ($output) {
	open STDOUT,'>',$output or die "can't open $output: $!";
}

# BN_ULONG-sized load/store/arithmetic mnemonics and the size of one
# BN_ULONG word in bytes.
if ($flavour =~ /64|n32/i) {
	$LD="ld";
	$ST="sd";
	$MULTU="dmultu";
	$ADDU="daddu";
	$SUBU="dsubu";
	$BNSZ=8;
} else {
	$LD="lw";
	$ST="sw";
	$MULTU="multu";
	$ADDU="addu";
	$SUBU="subu";
	$BNSZ=4;
}

# int bn_mul_mont(
$rp=$a0;	# BN_ULONG *rp,
$ap=$a1;	# const BN_ULONG *ap,
$bp=$a2;	# const BN_ULONG *bp,
$np=$a3;	# const BN_ULONG *np,
$n0=$a4;	# const BN_ULONG *n0,
$num=$a5;	# int num);

# Register assignment for the working variables of the generated code.
$lo0=$a6;
$hi0=$a7;
$lo1=$t1;
$hi1=$t2;
$aj=$s0;
$bi=$s1;
$nj=$s2;
$tp=$s3;	# note: rebinds $tp (thread-pointer alias above) to $s3,
		# used below as the cursor into the temporary vector
$alo=$s4;
$ahi=$s5;
$nlo=$s6;
$nhi=$s7;
$tj=$s8;
$i=$s9;
$j=$s10;
$m1=$s11;

# Size of the register save area, in units of $SZREG.
$FRAMESIZE=14;

# Public entry point: loads the stack-passed arguments on o32, performs
# the size checks on num and either returns 0 or branches to
# bn_mul_mont_internal.
$code=<<___;
.text

.set	noat
.set	noreorder

.align	5
.globl	bn_mul_mont
.ent	bn_mul_mont
bn_mul_mont:
___
$code.=<<___ if ($flavour =~ /o32/i);
	lw	$n0,16($sp)
	lw	$num,20($sp)
___
$code.=<<___;
	slt	$at,$num,4
	bnez	$at,1f
	li	$t0,0
	slt	$at,$num,17	# on in-order CPU
	bnez	$at,bn_mul_mont_internal
	nop
1:	jr	$ra
	li	$a0,0
.end	bn_mul_mont

.align	5
.ent	bn_mul_mont_internal
bn_mul_mont_internal:
	.frame	$fp,$FRAMESIZE*$SZREG,$ra
	.mask	0x40000000|$SAVED_REGS_MASK,-$SZREG
	$PTR_SUB $sp,$FRAMESIZE*$SZREG
	$REG_S	$fp,($FRAMESIZE-1)*$SZREG($sp)
	$REG_S	$s11,($FRAMESIZE-2)*$SZREG($sp)
	$REG_S	$s10,($FRAMESIZE-3)*$SZREG($sp)
	$REG_S	$s9,($FRAMESIZE-4)*$SZREG($sp)
	$REG_S	$s8,($FRAMESIZE-5)*$SZREG($sp)
	$REG_S	$s7,($FRAMESIZE-6)*$SZREG($sp)
	$REG_S	$s6,($FRAMESIZE-7)*$SZREG($sp)
	$REG_S	$s5,($FRAMESIZE-8)*$SZREG($sp)
	$REG_S	$s4,($FRAMESIZE-9)*$SZREG($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_S	$s3,($FRAMESIZE-10)*$SZREG($sp)
	$REG_S	$s2,($FRAMESIZE-11)*$SZREG($sp)
	$REG_S	$s1,($FRAMESIZE-12)*$SZREG($sp)
	$REG_S	$s0,($FRAMESIZE-13)*$SZREG($sp)
___
$code.=<<___;
	move	$fp,$sp

	.set	reorder
	$LD	$n0,0($n0)
	$LD	$bi,0($bp)	# bp[0]
	$LD	$aj,0($ap)	# ap[0]
	$LD	$nj,0($np)	# np[0]

	$PTR_SUB $sp,2*$BNSZ	# place for two extra words
	sll	$num,`log($BNSZ)/log(2)`
	li	$at,-4096
	$PTR_SUB $sp,$num
	and	$sp,$at

	$MULTU	$aj,$bi
	$LD	$alo,$BNSZ($ap)
	$LD	$nlo,$BNSZ($np)
	mflo	$lo0
	mfhi	$hi0
	$MULTU	$lo0,$n0
	mflo	$m1

	$MULTU	$alo,$bi
	mflo	$alo
	mfhi	$ahi

	$MULTU	$nj,$m1
	mflo	$lo1
	mfhi	$hi1
	$MULTU	$nlo,$m1
	$ADDU	$lo1,$lo0
	sltu	$at,$lo1,$lo0
	$ADDU	$hi1,$at
	mflo	$nlo
	mfhi	$nhi

	move	$tp,$sp
	li	$j,2*$BNSZ
.align	4
.L1st:
	.set	noreorder
	$PTR_ADD $aj,$ap,$j
	$PTR_ADD $nj,$np,$j
	$LD	$aj,($aj)
	$LD	$nj,($nj)

	$MULTU	$aj,$bi
	$ADDU	$lo0,$alo,$hi0
	$ADDU	$lo1,$nlo,$hi1
	sltu	$at,$lo0,$hi0
	sltu	$t0,$lo1,$hi1
	$ADDU	$hi0,$ahi,$at
	$ADDU	$hi1,$nhi,$t0
	mflo	$alo
	mfhi	$ahi

	$ADDU	$lo1,$lo0
	sltu	$at,$lo1,$lo0
	$MULTU	$nj,$m1
	$ADDU	$hi1,$at
	addu	$j,$BNSZ
	$ST	$lo1,($tp)
	sltu	$t0,$j,$num
	mflo	$nlo
	mfhi	$nhi

	bnez	$t0,.L1st
	$PTR_ADD $tp,$BNSZ
	.set	reorder

	$ADDU	$lo0,$alo,$hi0
	sltu	$at,$lo0,$hi0
	$ADDU	$hi0,$ahi,$at

	$ADDU	$lo1,$nlo,$hi1
	sltu	$t0,$lo1,$hi1
	$ADDU	$hi1,$nhi,$t0
	$ADDU	$lo1,$lo0
	sltu	$at,$lo1,$lo0
	$ADDU	$hi1,$at

	$ST	$lo1,($tp)

	$ADDU	$hi1,$hi0
	sltu	$at,$hi1,$hi0
	$ST	$hi1,$BNSZ($tp)
	$ST	$at,2*$BNSZ($tp)

	li	$i,$BNSZ
.align	4
.Louter:
	$PTR_ADD $bi,$bp,$i
	$LD	$bi,($bi)
	$LD	$aj,($ap)
	$LD	$alo,$BNSZ($ap)
	$LD	$tj,($sp)

	$MULTU	$aj,$bi
	$LD	$nj,($np)
	$LD	$nlo,$BNSZ($np)
	mflo	$lo0
	mfhi	$hi0
	$ADDU	$lo0,$tj
	$MULTU	$lo0,$n0
	sltu	$at,$lo0,$tj
	$ADDU	$hi0,$at
	mflo	$m1

	$MULTU	$alo,$bi
	mflo	$alo
	mfhi	$ahi

	$MULTU	$nj,$m1
	mflo	$lo1
	mfhi	$hi1

	$MULTU	$nlo,$m1
	$ADDU	$lo1,$lo0
	sltu	$at,$lo1,$lo0
	$ADDU	$hi1,$at
	mflo	$nlo
	mfhi	$nhi

	move	$tp,$sp
	li	$j,2*$BNSZ
	$LD	$tj,$BNSZ($tp)
.align	4
.Linner:
	.set	noreorder
	$PTR_ADD $aj,$ap,$j
	$PTR_ADD $nj,$np,$j
	$LD	$aj,($aj)
	$LD	$nj,($nj)

	$MULTU	$aj,$bi
	$ADDU	$lo0,$alo,$hi0
	$ADDU	$lo1,$nlo,$hi1
	sltu	$at,$lo0,$hi0
	sltu	$t0,$lo1,$hi1
	$ADDU	$hi0,$ahi,$at
	$ADDU	$hi1,$nhi,$t0
	mflo	$alo
	mfhi	$ahi

	$ADDU	$lo0,$tj
	addu	$j,$BNSZ
	$MULTU	$nj,$m1
	sltu	$at,$lo0,$tj
	$ADDU	$lo1,$lo0
	$ADDU	$hi0,$at
	sltu	$t0,$lo1,$lo0
	$LD	$tj,2*$BNSZ($tp)
	$ADDU	$hi1,$t0
	sltu	$at,$j,$num
	mflo	$nlo
	mfhi	$nhi
	$ST	$lo1,($tp)
	bnez	$at,.Linner
	$PTR_ADD $tp,$BNSZ
	.set	reorder

	$ADDU	$lo0,$alo,$hi0
	sltu	$at,$lo0,$hi0
	$ADDU	$hi0,$ahi,$at
	$ADDU	$lo0,$tj
	sltu	$t0,$lo0,$tj
	$ADDU	$hi0,$t0

	$LD	$tj,2*$BNSZ($tp)
	$ADDU	$lo1,$nlo,$hi1
	sltu	$at,$lo1,$hi1
	$ADDU	$hi1,$nhi,$at
	$ADDU	$lo1,$lo0
	sltu	$t0,$lo1,$lo0
	$ADDU	$hi1,$t0
	$ST	$lo1,($tp)

	$ADDU	$lo1,$hi1,$hi0
	sltu	$hi1,$lo1,$hi0
	$ADDU	$lo1,$tj
	sltu	$at,$lo1,$tj
	$ADDU	$hi1,$at
	$ST	$lo1,$BNSZ($tp)
	$ST	$hi1,2*$BNSZ($tp)

	addu	$i,$BNSZ
	sltu	$t0,$i,$num
	bnez	$t0,.Louter

	.set	noreorder
	$PTR_ADD $tj,$sp,$num	# &tp[num]
	move	$tp,$sp
	move	$ap,$sp
	li	$hi0,0		# clear borrow bit

.align	4
.Lsub:	$LD	$lo0,($tp)
	$LD	$lo1,($np)
	$PTR_ADD $tp,$BNSZ
	$PTR_ADD $np,$BNSZ
	$SUBU	$lo1,$lo0,$lo1	# tp[i]-np[i]
	sgtu	$at,$lo1,$lo0
	$SUBU	$lo0,$lo1,$hi0
	sgtu	$hi0,$lo0,$lo1
	$ST	$lo0,($rp)
	or	$hi0,$at
	sltu	$at,$tp,$tj
	bnez	$at,.Lsub
	$PTR_ADD $rp,$BNSZ

	$SUBU	$hi0,$hi1,$hi0	# handle upmost overflow bit
	move	$tp,$sp
	$PTR_SUB $rp,$num	# restore rp
	not	$hi1,$hi0

	and	$ap,$hi0,$sp
	and	$bp,$hi1,$rp
	or	$ap,$ap,$bp	# ap=borrow?tp:rp

.align	4
.Lcopy:	$LD	$aj,($ap)
	$PTR_ADD $ap,$BNSZ
	$ST	$zero,($tp)
	$PTR_ADD $tp,$BNSZ
	sltu	$at,$tp,$tj
	$ST	$aj,($rp)
	bnez	$at,.Lcopy
	$PTR_ADD $rp,$BNSZ

	li	$a0,1
	li	$t0,1

	.set	noreorder
	move	$sp,$fp
	$REG_L	$fp,($FRAMESIZE-1)*$SZREG($sp)
	$REG_L	$s11,($FRAMESIZE-2)*$SZREG($sp)
	$REG_L	$s10,($FRAMESIZE-3)*$SZREG($sp)
	$REG_L	$s9,($FRAMESIZE-4)*$SZREG($sp)
	$REG_L	$s8,($FRAMESIZE-5)*$SZREG($sp)
	$REG_L	$s7,($FRAMESIZE-6)*$SZREG($sp)
	$REG_L	$s6,($FRAMESIZE-7)*$SZREG($sp)
	$REG_L	$s5,($FRAMESIZE-8)*$SZREG($sp)
	$REG_L	$s4,($FRAMESIZE-9)*$SZREG($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$s3,($FRAMESIZE-10)*$SZREG($sp)
	$REG_L	$s2,($FRAMESIZE-11)*$SZREG($sp)
	$REG_L	$s1,($FRAMESIZE-12)*$SZREG($sp)
	$REG_L	$s0,($FRAMESIZE-13)*$SZREG($sp)
___
$code.=<<___;
	jr	$ra
	$PTR_ADD $sp,$FRAMESIZE*$SZREG
.end	bn_mul_mont_internal
.rdata
.asciiz	"Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
___

# Evaluate the Perl expressions embedded between backticks in the
# template (e.g. the shift count derived from $BNSZ) and splice in
# their values.
$code =~ s/\`([^\`]*)\`/eval $1/gem;

print $code;
# Buffered write errors only surface at close time, so check it rather
# than exiting successfully with a truncated assembler file.
close STDOUT or die "error closing STDOUT: $!";