# mips.pl revision 264331
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project.
#
# Rights for redistribution and usage in source and binary forms are
# granted according to the OpenSSL license. Warranty of any kind is
# disclaimed.
# ====================================================================

# July 1999
#
# This is drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c.
#
# The module is designed to work with either of the "new" MIPS ABI(5),
# namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
# IRIX 5.x not only because it doesn't support new ABIs but also
# because 5.x kernels put R4x00 CPU into 32-bit mode and all those
# 64-bit instructions (daddu, dmultu, etc.) found below gonna only
# cause illegal instruction exception:-(
#
# In addition the code depends on preprocessor flags set up by MIPSpro
# compiler driver (either as or cc) and therefore (probably?) can't be
# compiled by the GNU assembler. GNU C driver manages fine though...
# I mean as long as -mmips-as is specified or is the default option,
# because then it simply invokes /usr/bin/as which in turn takes
# perfect care of the preprocessor definitions. Another neat feature
# offered by the MIPSpro assembler is an optimization pass. This gave
# me the opportunity to have the code looking more regular as all those
# architecture dependent instruction rescheduling details were left to
# the assembler. Cool, huh?
#
# Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
# goes way over 3 times faster!
#
# <appro@fy.chalmers.se>

# October 2010
#
# Adapt the module even for 32-bit ABIs and other OSes. The former was
# achieved by mechanical replacement of 64-bit arithmetic instructions
# such as dmultu, daddu, etc. with their 32-bit counterparts and
# adjusting offsets denoting multiples of BN_ULONG. Above mentioned
# >3x performance improvement naturally does not apply to 32-bit code
# [because there is no instruction 32-bit compiler can't use], one
# has to be content with 40-85% improvement depending on benchmark and
# key length, more for longer keys.

# First argument selects the ABI flavour; the remaining arguments are
# scanned for the first one that looks like an output file name
# ("name.ext").
$flavour = shift;
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
# Three-argument open with an explicit error check: the original
# two-argument, unchecked `open STDOUT,">$output"` would silently
# discard all generated assembly if $output could not be created.
open STDOUT,'>',$output or die "can't open $output: $!";

# Select instruction mnemonics and sizes for the chosen ABI.  64-bit
# flavours (N32/N64) operate on doubleword BN_ULONGs, 32-bit flavours
# on words; $BNSZ is sizeof(BN_ULONG) and $SZREG is the size of one
# register save slot on the stack (see the $REG_S/$REG_L frames below).
if ($flavour =~ /64|n32/i) {
	$LD="ld";
	$ST="sd";
	$MULTU="dmultu";
	$DIVU="ddivu";
	$ADDU="daddu";
	$SUBU="dsubu";
	$SRL="dsrl";
	$SLL="dsll";
	$BNSZ=8;
	$PTR_ADD="daddu";
	$PTR_SUB="dsubu";
	$SZREG=8;
	$REG_S="sd";
	$REG_L="ld";
} else {
	$LD="lw";
	$ST="sw";
	$MULTU="multu";
	$DIVU="divu";
	$ADDU="addu";
	$SUBU="subu";
	$SRL="srl";
	$SLL="sll";
	$BNSZ=4;
	$PTR_ADD="addu";
	$PTR_SUB="subu";
	$SZREG=4;
	$REG_S="sw";
	$REG_L="lw";
	$code=".set mips2\n";
}

# Below is N32/64 register layout used in the original module.
#
($zero,$at,$v0,$v1)=map("\$$_",(0..3));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
($ta0,$ta1,$ta2,$ta3)=($a4,$a5,$a6,$a7);
#
# No special adaptation is required for O32. NUBI on the other hand
# is treated by saving/restoring ($v1,$t0..$t3).
99 100$gp=$v1 if ($flavour =~ /nubi/i); 101 102$minus4=$v1; 103 104$code.=<<___; 105.rdata 106.asciiz "mips3.s, Version 1.2" 107.asciiz "MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>" 108 109.text 110.set noat 111 112.align 5 113.globl bn_mul_add_words 114.ent bn_mul_add_words 115bn_mul_add_words: 116 .set noreorder 117 bgtz $a2,bn_mul_add_words_internal 118 move $v0,$zero 119 jr $ra 120 move $a0,$v0 121.end bn_mul_add_words 122 123.align 5 124.ent bn_mul_add_words_internal 125bn_mul_add_words_internal: 126___ 127$code.=<<___ if ($flavour =~ /nubi/i); 128 .frame $sp,6*$SZREG,$ra 129 .mask 0x8000f008,-$SZREG 130 .set noreorder 131 $PTR_SUB $sp,6*$SZREG 132 $REG_S $ra,5*$SZREG($sp) 133 $REG_S $t3,4*$SZREG($sp) 134 $REG_S $t2,3*$SZREG($sp) 135 $REG_S $t1,2*$SZREG($sp) 136 $REG_S $t0,1*$SZREG($sp) 137 $REG_S $gp,0*$SZREG($sp) 138___ 139$code.=<<___; 140 .set reorder 141 li $minus4,-4 142 and $ta0,$a2,$minus4 143 beqz $ta0,.L_bn_mul_add_words_tail 144 145.L_bn_mul_add_words_loop: 146 $LD $t0,0($a1) 147 $MULTU $t0,$a3 148 $LD $t1,0($a0) 149 $LD $t2,$BNSZ($a1) 150 $LD $t3,$BNSZ($a0) 151 $LD $ta0,2*$BNSZ($a1) 152 $LD $ta1,2*$BNSZ($a0) 153 $ADDU $t1,$v0 154 sltu $v0,$t1,$v0 # All manuals say it "compares 32-bit 155 # values", but it seems to work fine 156 # even on 64-bit registers. 
157 mflo $at 158 mfhi $t0 159 $ADDU $t1,$at 160 $ADDU $v0,$t0 161 $MULTU $t2,$a3 162 sltu $at,$t1,$at 163 $ST $t1,0($a0) 164 $ADDU $v0,$at 165 166 $LD $ta2,3*$BNSZ($a1) 167 $LD $ta3,3*$BNSZ($a0) 168 $ADDU $t3,$v0 169 sltu $v0,$t3,$v0 170 mflo $at 171 mfhi $t2 172 $ADDU $t3,$at 173 $ADDU $v0,$t2 174 $MULTU $ta0,$a3 175 sltu $at,$t3,$at 176 $ST $t3,$BNSZ($a0) 177 $ADDU $v0,$at 178 179 subu $a2,4 180 $PTR_ADD $a0,4*$BNSZ 181 $PTR_ADD $a1,4*$BNSZ 182 $ADDU $ta1,$v0 183 sltu $v0,$ta1,$v0 184 mflo $at 185 mfhi $ta0 186 $ADDU $ta1,$at 187 $ADDU $v0,$ta0 188 $MULTU $ta2,$a3 189 sltu $at,$ta1,$at 190 $ST $ta1,-2*$BNSZ($a0) 191 $ADDU $v0,$at 192 193 194 and $ta0,$a2,$minus4 195 $ADDU $ta3,$v0 196 sltu $v0,$ta3,$v0 197 mflo $at 198 mfhi $ta2 199 $ADDU $ta3,$at 200 $ADDU $v0,$ta2 201 sltu $at,$ta3,$at 202 $ST $ta3,-$BNSZ($a0) 203 .set noreorder 204 bgtz $ta0,.L_bn_mul_add_words_loop 205 $ADDU $v0,$at 206 207 beqz $a2,.L_bn_mul_add_words_return 208 nop 209 210.L_bn_mul_add_words_tail: 211 .set reorder 212 $LD $t0,0($a1) 213 $MULTU $t0,$a3 214 $LD $t1,0($a0) 215 subu $a2,1 216 $ADDU $t1,$v0 217 sltu $v0,$t1,$v0 218 mflo $at 219 mfhi $t0 220 $ADDU $t1,$at 221 $ADDU $v0,$t0 222 sltu $at,$t1,$at 223 $ST $t1,0($a0) 224 $ADDU $v0,$at 225 beqz $a2,.L_bn_mul_add_words_return 226 227 $LD $t0,$BNSZ($a1) 228 $MULTU $t0,$a3 229 $LD $t1,$BNSZ($a0) 230 subu $a2,1 231 $ADDU $t1,$v0 232 sltu $v0,$t1,$v0 233 mflo $at 234 mfhi $t0 235 $ADDU $t1,$at 236 $ADDU $v0,$t0 237 sltu $at,$t1,$at 238 $ST $t1,$BNSZ($a0) 239 $ADDU $v0,$at 240 beqz $a2,.L_bn_mul_add_words_return 241 242 $LD $t0,2*$BNSZ($a1) 243 $MULTU $t0,$a3 244 $LD $t1,2*$BNSZ($a0) 245 $ADDU $t1,$v0 246 sltu $v0,$t1,$v0 247 mflo $at 248 mfhi $t0 249 $ADDU $t1,$at 250 $ADDU $v0,$t0 251 sltu $at,$t1,$at 252 $ST $t1,2*$BNSZ($a0) 253 $ADDU $v0,$at 254 255.L_bn_mul_add_words_return: 256 .set noreorder 257___ 258$code.=<<___ if ($flavour =~ /nubi/i); 259 $REG_L $t3,4*$SZREG($sp) 260 $REG_L $t2,3*$SZREG($sp) 261 $REG_L $t1,2*$SZREG($sp) 262 
$REG_L $t0,1*$SZREG($sp) 263 $REG_L $gp,0*$SZREG($sp) 264 $PTR_ADD $sp,6*$SZREG 265___ 266$code.=<<___; 267 jr $ra 268 move $a0,$v0 269.end bn_mul_add_words_internal 270 271.align 5 272.globl bn_mul_words 273.ent bn_mul_words 274bn_mul_words: 275 .set noreorder 276 bgtz $a2,bn_mul_words_internal 277 move $v0,$zero 278 jr $ra 279 move $a0,$v0 280.end bn_mul_words 281 282.align 5 283.ent bn_mul_words_internal 284bn_mul_words_internal: 285___ 286$code.=<<___ if ($flavour =~ /nubi/i); 287 .frame $sp,6*$SZREG,$ra 288 .mask 0x8000f008,-$SZREG 289 .set noreorder 290 $PTR_SUB $sp,6*$SZREG 291 $REG_S $ra,5*$SZREG($sp) 292 $REG_S $t3,4*$SZREG($sp) 293 $REG_S $t2,3*$SZREG($sp) 294 $REG_S $t1,2*$SZREG($sp) 295 $REG_S $t0,1*$SZREG($sp) 296 $REG_S $gp,0*$SZREG($sp) 297___ 298$code.=<<___; 299 .set reorder 300 li $minus4,-4 301 and $ta0,$a2,$minus4 302 beqz $ta0,.L_bn_mul_words_tail 303 304.L_bn_mul_words_loop: 305 $LD $t0,0($a1) 306 $MULTU $t0,$a3 307 $LD $t2,$BNSZ($a1) 308 $LD $ta0,2*$BNSZ($a1) 309 $LD $ta2,3*$BNSZ($a1) 310 mflo $at 311 mfhi $t0 312 $ADDU $v0,$at 313 sltu $t1,$v0,$at 314 $MULTU $t2,$a3 315 $ST $v0,0($a0) 316 $ADDU $v0,$t1,$t0 317 318 subu $a2,4 319 $PTR_ADD $a0,4*$BNSZ 320 $PTR_ADD $a1,4*$BNSZ 321 mflo $at 322 mfhi $t2 323 $ADDU $v0,$at 324 sltu $t3,$v0,$at 325 $MULTU $ta0,$a3 326 $ST $v0,-3*$BNSZ($a0) 327 $ADDU $v0,$t3,$t2 328 329 mflo $at 330 mfhi $ta0 331 $ADDU $v0,$at 332 sltu $ta1,$v0,$at 333 $MULTU $ta2,$a3 334 $ST $v0,-2*$BNSZ($a0) 335 $ADDU $v0,$ta1,$ta0 336 337 and $ta0,$a2,$minus4 338 mflo $at 339 mfhi $ta2 340 $ADDU $v0,$at 341 sltu $ta3,$v0,$at 342 $ST $v0,-$BNSZ($a0) 343 .set noreorder 344 bgtz $ta0,.L_bn_mul_words_loop 345 $ADDU $v0,$ta3,$ta2 346 347 beqz $a2,.L_bn_mul_words_return 348 nop 349 350.L_bn_mul_words_tail: 351 .set reorder 352 $LD $t0,0($a1) 353 $MULTU $t0,$a3 354 subu $a2,1 355 mflo $at 356 mfhi $t0 357 $ADDU $v0,$at 358 sltu $t1,$v0,$at 359 $ST $v0,0($a0) 360 $ADDU $v0,$t1,$t0 361 beqz $a2,.L_bn_mul_words_return 362 363 $LD 
$t0,$BNSZ($a1) 364 $MULTU $t0,$a3 365 subu $a2,1 366 mflo $at 367 mfhi $t0 368 $ADDU $v0,$at 369 sltu $t1,$v0,$at 370 $ST $v0,$BNSZ($a0) 371 $ADDU $v0,$t1,$t0 372 beqz $a2,.L_bn_mul_words_return 373 374 $LD $t0,2*$BNSZ($a1) 375 $MULTU $t0,$a3 376 mflo $at 377 mfhi $t0 378 $ADDU $v0,$at 379 sltu $t1,$v0,$at 380 $ST $v0,2*$BNSZ($a0) 381 $ADDU $v0,$t1,$t0 382 383.L_bn_mul_words_return: 384 .set noreorder 385___ 386$code.=<<___ if ($flavour =~ /nubi/i); 387 $REG_L $t3,4*$SZREG($sp) 388 $REG_L $t2,3*$SZREG($sp) 389 $REG_L $t1,2*$SZREG($sp) 390 $REG_L $t0,1*$SZREG($sp) 391 $REG_L $gp,0*$SZREG($sp) 392 $PTR_ADD $sp,6*$SZREG 393___ 394$code.=<<___; 395 jr $ra 396 move $a0,$v0 397.end bn_mul_words_internal 398 399.align 5 400.globl bn_sqr_words 401.ent bn_sqr_words 402bn_sqr_words: 403 .set noreorder 404 bgtz $a2,bn_sqr_words_internal 405 move $v0,$zero 406 jr $ra 407 move $a0,$v0 408.end bn_sqr_words 409 410.align 5 411.ent bn_sqr_words_internal 412bn_sqr_words_internal: 413___ 414$code.=<<___ if ($flavour =~ /nubi/i); 415 .frame $sp,6*$SZREG,$ra 416 .mask 0x8000f008,-$SZREG 417 .set noreorder 418 $PTR_SUB $sp,6*$SZREG 419 $REG_S $ra,5*$SZREG($sp) 420 $REG_S $t3,4*$SZREG($sp) 421 $REG_S $t2,3*$SZREG($sp) 422 $REG_S $t1,2*$SZREG($sp) 423 $REG_S $t0,1*$SZREG($sp) 424 $REG_S $gp,0*$SZREG($sp) 425___ 426$code.=<<___; 427 .set reorder 428 li $minus4,-4 429 and $ta0,$a2,$minus4 430 beqz $ta0,.L_bn_sqr_words_tail 431 432.L_bn_sqr_words_loop: 433 $LD $t0,0($a1) 434 $MULTU $t0,$t0 435 $LD $t2,$BNSZ($a1) 436 $LD $ta0,2*$BNSZ($a1) 437 $LD $ta2,3*$BNSZ($a1) 438 mflo $t1 439 mfhi $t0 440 $ST $t1,0($a0) 441 $ST $t0,$BNSZ($a0) 442 443 $MULTU $t2,$t2 444 subu $a2,4 445 $PTR_ADD $a0,8*$BNSZ 446 $PTR_ADD $a1,4*$BNSZ 447 mflo $t3 448 mfhi $t2 449 $ST $t3,-6*$BNSZ($a0) 450 $ST $t2,-5*$BNSZ($a0) 451 452 $MULTU $ta0,$ta0 453 mflo $ta1 454 mfhi $ta0 455 $ST $ta1,-4*$BNSZ($a0) 456 $ST $ta0,-3*$BNSZ($a0) 457 458 459 $MULTU $ta2,$ta2 460 and $ta0,$a2,$minus4 461 mflo $ta3 462 mfhi $ta2 463 $ST 
$ta3,-2*$BNSZ($a0) 464 465 .set noreorder 466 bgtz $ta0,.L_bn_sqr_words_loop 467 $ST $ta2,-$BNSZ($a0) 468 469 beqz $a2,.L_bn_sqr_words_return 470 nop 471 472.L_bn_sqr_words_tail: 473 .set reorder 474 $LD $t0,0($a1) 475 $MULTU $t0,$t0 476 subu $a2,1 477 mflo $t1 478 mfhi $t0 479 $ST $t1,0($a0) 480 $ST $t0,$BNSZ($a0) 481 beqz $a2,.L_bn_sqr_words_return 482 483 $LD $t0,$BNSZ($a1) 484 $MULTU $t0,$t0 485 subu $a2,1 486 mflo $t1 487 mfhi $t0 488 $ST $t1,2*$BNSZ($a0) 489 $ST $t0,3*$BNSZ($a0) 490 beqz $a2,.L_bn_sqr_words_return 491 492 $LD $t0,2*$BNSZ($a1) 493 $MULTU $t0,$t0 494 mflo $t1 495 mfhi $t0 496 $ST $t1,4*$BNSZ($a0) 497 $ST $t0,5*$BNSZ($a0) 498 499.L_bn_sqr_words_return: 500 .set noreorder 501___ 502$code.=<<___ if ($flavour =~ /nubi/i); 503 $REG_L $t3,4*$SZREG($sp) 504 $REG_L $t2,3*$SZREG($sp) 505 $REG_L $t1,2*$SZREG($sp) 506 $REG_L $t0,1*$SZREG($sp) 507 $REG_L $gp,0*$SZREG($sp) 508 $PTR_ADD $sp,6*$SZREG 509___ 510$code.=<<___; 511 jr $ra 512 move $a0,$v0 513 514.end bn_sqr_words_internal 515 516.align 5 517.globl bn_add_words 518.ent bn_add_words 519bn_add_words: 520 .set noreorder 521 bgtz $a3,bn_add_words_internal 522 move $v0,$zero 523 jr $ra 524 move $a0,$v0 525.end bn_add_words 526 527.align 5 528.ent bn_add_words_internal 529bn_add_words_internal: 530___ 531$code.=<<___ if ($flavour =~ /nubi/i); 532 .frame $sp,6*$SZREG,$ra 533 .mask 0x8000f008,-$SZREG 534 .set noreorder 535 $PTR_SUB $sp,6*$SZREG 536 $REG_S $ra,5*$SZREG($sp) 537 $REG_S $t3,4*$SZREG($sp) 538 $REG_S $t2,3*$SZREG($sp) 539 $REG_S $t1,2*$SZREG($sp) 540 $REG_S $t0,1*$SZREG($sp) 541 $REG_S $gp,0*$SZREG($sp) 542___ 543$code.=<<___; 544 .set reorder 545 li $minus4,-4 546 and $at,$a3,$minus4 547 beqz $at,.L_bn_add_words_tail 548 549.L_bn_add_words_loop: 550 $LD $t0,0($a1) 551 $LD $ta0,0($a2) 552 subu $a3,4 553 $LD $t1,$BNSZ($a1) 554 and $at,$a3,$minus4 555 $LD $t2,2*$BNSZ($a1) 556 $PTR_ADD $a2,4*$BNSZ 557 $LD $t3,3*$BNSZ($a1) 558 $PTR_ADD $a0,4*$BNSZ 559 $LD $ta1,-3*$BNSZ($a2) 560 $PTR_ADD 
$a1,4*$BNSZ 561 $LD $ta2,-2*$BNSZ($a2) 562 $LD $ta3,-$BNSZ($a2) 563 $ADDU $ta0,$t0 564 sltu $t8,$ta0,$t0 565 $ADDU $t0,$ta0,$v0 566 sltu $v0,$t0,$ta0 567 $ST $t0,-4*$BNSZ($a0) 568 $ADDU $v0,$t8 569 570 $ADDU $ta1,$t1 571 sltu $t9,$ta1,$t1 572 $ADDU $t1,$ta1,$v0 573 sltu $v0,$t1,$ta1 574 $ST $t1,-3*$BNSZ($a0) 575 $ADDU $v0,$t9 576 577 $ADDU $ta2,$t2 578 sltu $t8,$ta2,$t2 579 $ADDU $t2,$ta2,$v0 580 sltu $v0,$t2,$ta2 581 $ST $t2,-2*$BNSZ($a0) 582 $ADDU $v0,$t8 583 584 $ADDU $ta3,$t3 585 sltu $t9,$ta3,$t3 586 $ADDU $t3,$ta3,$v0 587 sltu $v0,$t3,$ta3 588 $ST $t3,-$BNSZ($a0) 589 590 .set noreorder 591 bgtz $at,.L_bn_add_words_loop 592 $ADDU $v0,$t9 593 594 beqz $a3,.L_bn_add_words_return 595 nop 596 597.L_bn_add_words_tail: 598 .set reorder 599 $LD $t0,0($a1) 600 $LD $ta0,0($a2) 601 $ADDU $ta0,$t0 602 subu $a3,1 603 sltu $t8,$ta0,$t0 604 $ADDU $t0,$ta0,$v0 605 sltu $v0,$t0,$ta0 606 $ST $t0,0($a0) 607 $ADDU $v0,$t8 608 beqz $a3,.L_bn_add_words_return 609 610 $LD $t1,$BNSZ($a1) 611 $LD $ta1,$BNSZ($a2) 612 $ADDU $ta1,$t1 613 subu $a3,1 614 sltu $t9,$ta1,$t1 615 $ADDU $t1,$ta1,$v0 616 sltu $v0,$t1,$ta1 617 $ST $t1,$BNSZ($a0) 618 $ADDU $v0,$t9 619 beqz $a3,.L_bn_add_words_return 620 621 $LD $t2,2*$BNSZ($a1) 622 $LD $ta2,2*$BNSZ($a2) 623 $ADDU $ta2,$t2 624 sltu $t8,$ta2,$t2 625 $ADDU $t2,$ta2,$v0 626 sltu $v0,$t2,$ta2 627 $ST $t2,2*$BNSZ($a0) 628 $ADDU $v0,$t8 629 630.L_bn_add_words_return: 631 .set noreorder 632___ 633$code.=<<___ if ($flavour =~ /nubi/i); 634 $REG_L $t3,4*$SZREG($sp) 635 $REG_L $t2,3*$SZREG($sp) 636 $REG_L $t1,2*$SZREG($sp) 637 $REG_L $t0,1*$SZREG($sp) 638 $REG_L $gp,0*$SZREG($sp) 639 $PTR_ADD $sp,6*$SZREG 640___ 641$code.=<<___; 642 jr $ra 643 move $a0,$v0 644 645.end bn_add_words_internal 646 647.align 5 648.globl bn_sub_words 649.ent bn_sub_words 650bn_sub_words: 651 .set noreorder 652 bgtz $a3,bn_sub_words_internal 653 move $v0,$zero 654 jr $ra 655 move $a0,$zero 656.end bn_sub_words 657 658.align 5 659.ent bn_sub_words_internal 660bn_sub_words_internal: 
661___ 662$code.=<<___ if ($flavour =~ /nubi/i); 663 .frame $sp,6*$SZREG,$ra 664 .mask 0x8000f008,-$SZREG 665 .set noreorder 666 $PTR_SUB $sp,6*$SZREG 667 $REG_S $ra,5*$SZREG($sp) 668 $REG_S $t3,4*$SZREG($sp) 669 $REG_S $t2,3*$SZREG($sp) 670 $REG_S $t1,2*$SZREG($sp) 671 $REG_S $t0,1*$SZREG($sp) 672 $REG_S $gp,0*$SZREG($sp) 673___ 674$code.=<<___; 675 .set reorder 676 li $minus4,-4 677 and $at,$a3,$minus4 678 beqz $at,.L_bn_sub_words_tail 679 680.L_bn_sub_words_loop: 681 $LD $t0,0($a1) 682 $LD $ta0,0($a2) 683 subu $a3,4 684 $LD $t1,$BNSZ($a1) 685 and $at,$a3,$minus4 686 $LD $t2,2*$BNSZ($a1) 687 $PTR_ADD $a2,4*$BNSZ 688 $LD $t3,3*$BNSZ($a1) 689 $PTR_ADD $a0,4*$BNSZ 690 $LD $ta1,-3*$BNSZ($a2) 691 $PTR_ADD $a1,4*$BNSZ 692 $LD $ta2,-2*$BNSZ($a2) 693 $LD $ta3,-$BNSZ($a2) 694 sltu $t8,$t0,$ta0 695 $SUBU $ta0,$t0,$ta0 696 $SUBU $t0,$ta0,$v0 697 sgtu $v0,$t0,$ta0 698 $ST $t0,-4*$BNSZ($a0) 699 $ADDU $v0,$t8 700 701 sltu $t9,$t1,$ta1 702 $SUBU $ta1,$t1,$ta1 703 $SUBU $t1,$ta1,$v0 704 sgtu $v0,$t1,$ta1 705 $ST $t1,-3*$BNSZ($a0) 706 $ADDU $v0,$t9 707 708 709 sltu $t8,$t2,$ta2 710 $SUBU $ta2,$t2,$ta2 711 $SUBU $t2,$ta2,$v0 712 sgtu $v0,$t2,$ta2 713 $ST $t2,-2*$BNSZ($a0) 714 $ADDU $v0,$t8 715 716 sltu $t9,$t3,$ta3 717 $SUBU $ta3,$t3,$ta3 718 $SUBU $t3,$ta3,$v0 719 sgtu $v0,$t3,$ta3 720 $ST $t3,-$BNSZ($a0) 721 722 .set noreorder 723 bgtz $at,.L_bn_sub_words_loop 724 $ADDU $v0,$t9 725 726 beqz $a3,.L_bn_sub_words_return 727 nop 728 729.L_bn_sub_words_tail: 730 .set reorder 731 $LD $t0,0($a1) 732 $LD $ta0,0($a2) 733 subu $a3,1 734 sltu $t8,$t0,$ta0 735 $SUBU $ta0,$t0,$ta0 736 $SUBU $t0,$ta0,$v0 737 sgtu $v0,$t0,$ta0 738 $ST $t0,0($a0) 739 $ADDU $v0,$t8 740 beqz $a3,.L_bn_sub_words_return 741 742 $LD $t1,$BNSZ($a1) 743 subu $a3,1 744 $LD $ta1,$BNSZ($a2) 745 sltu $t9,$t1,$ta1 746 $SUBU $ta1,$t1,$ta1 747 $SUBU $t1,$ta1,$v0 748 sgtu $v0,$t1,$ta1 749 $ST $t1,$BNSZ($a0) 750 $ADDU $v0,$t9 751 beqz $a3,.L_bn_sub_words_return 752 753 $LD $t2,2*$BNSZ($a1) 754 $LD $ta2,2*$BNSZ($a2) 755 sltu 
$t8,$t2,$ta2 756 $SUBU $ta2,$t2,$ta2 757 $SUBU $t2,$ta2,$v0 758 sgtu $v0,$t2,$ta2 759 $ST $t2,2*$BNSZ($a0) 760 $ADDU $v0,$t8 761 762.L_bn_sub_words_return: 763 .set noreorder 764___ 765$code.=<<___ if ($flavour =~ /nubi/i); 766 $REG_L $t3,4*$SZREG($sp) 767 $REG_L $t2,3*$SZREG($sp) 768 $REG_L $t1,2*$SZREG($sp) 769 $REG_L $t0,1*$SZREG($sp) 770 $REG_L $gp,0*$SZREG($sp) 771 $PTR_ADD $sp,6*$SZREG 772___ 773$code.=<<___; 774 jr $ra 775 move $a0,$v0 776.end bn_sub_words_internal 777 778.align 5 779.globl bn_div_3_words 780.ent bn_div_3_words 781bn_div_3_words: 782 .set noreorder 783 move $a3,$a0 # we know that bn_div_words does not 784 # touch $a3, $ta2, $ta3 and preserves $a2 785 # so that we can save two arguments 786 # and return address in registers 787 # instead of stack:-) 788 789 $LD $a0,($a3) 790 move $ta2,$a1 791 bne $a0,$a2,bn_div_3_words_internal 792 $LD $a1,-$BNSZ($a3) 793 li $v0,-1 794 jr $ra 795 move $a0,$v0 796.end bn_div_3_words 797 798.align 5 799.ent bn_div_3_words_internal 800bn_div_3_words_internal: 801___ 802$code.=<<___ if ($flavour =~ /nubi/i); 803 .frame $sp,6*$SZREG,$ra 804 .mask 0x8000f008,-$SZREG 805 .set noreorder 806 $PTR_SUB $sp,6*$SZREG 807 $REG_S $ra,5*$SZREG($sp) 808 $REG_S $t3,4*$SZREG($sp) 809 $REG_S $t2,3*$SZREG($sp) 810 $REG_S $t1,2*$SZREG($sp) 811 $REG_S $t0,1*$SZREG($sp) 812 $REG_S $gp,0*$SZREG($sp) 813___ 814$code.=<<___; 815 .set reorder 816 move $ta3,$ra 817 bal bn_div_words_internal 818 move $ra,$ta3 819 $MULTU $ta2,$v0 820 $LD $t2,-2*$BNSZ($a3) 821 move $ta0,$zero 822 mfhi $t1 823 mflo $t0 824 sltu $t8,$t1,$a1 825.L_bn_div_3_words_inner_loop: 826 bnez $t8,.L_bn_div_3_words_inner_loop_done 827 sgeu $at,$t2,$t0 828 seq $t9,$t1,$a1 829 and $at,$t9 830 sltu $t3,$t0,$ta2 831 $ADDU $a1,$a2 832 $SUBU $t1,$t3 833 $SUBU $t0,$ta2 834 sltu $t8,$t1,$a1 835 sltu $ta0,$a1,$a2 836 or $t8,$ta0 837 .set noreorder 838 beqz $at,.L_bn_div_3_words_inner_loop 839 $SUBU $v0,1 840 $ADDU $v0,1 841 .set reorder 842.L_bn_div_3_words_inner_loop_done: 843 
.set noreorder 844___ 845$code.=<<___ if ($flavour =~ /nubi/i); 846 $REG_L $t3,4*$SZREG($sp) 847 $REG_L $t2,3*$SZREG($sp) 848 $REG_L $t1,2*$SZREG($sp) 849 $REG_L $t0,1*$SZREG($sp) 850 $REG_L $gp,0*$SZREG($sp) 851 $PTR_ADD $sp,6*$SZREG 852___ 853$code.=<<___; 854 jr $ra 855 move $a0,$v0 856.end bn_div_3_words_internal 857 858.align 5 859.globl bn_div_words 860.ent bn_div_words 861bn_div_words: 862 .set noreorder 863 bnez $a2,bn_div_words_internal 864 li $v0,-1 # I would rather signal div-by-zero 865 # which can be done with 'break 7' 866 jr $ra 867 move $a0,$v0 868.end bn_div_words 869 870.align 5 871.ent bn_div_words_internal 872bn_div_words_internal: 873___ 874$code.=<<___ if ($flavour =~ /nubi/i); 875 .frame $sp,6*$SZREG,$ra 876 .mask 0x8000f008,-$SZREG 877 .set noreorder 878 $PTR_SUB $sp,6*$SZREG 879 $REG_S $ra,5*$SZREG($sp) 880 $REG_S $t3,4*$SZREG($sp) 881 $REG_S $t2,3*$SZREG($sp) 882 $REG_S $t1,2*$SZREG($sp) 883 $REG_S $t0,1*$SZREG($sp) 884 $REG_S $gp,0*$SZREG($sp) 885___ 886$code.=<<___; 887 move $v1,$zero 888 bltz $a2,.L_bn_div_words_body 889 move $t9,$v1 890 $SLL $a2,1 891 bgtz $a2,.-4 892 addu $t9,1 893 894 .set reorder 895 negu $t1,$t9 896 li $t2,-1 897 $SLL $t2,$t1 898 and $t2,$a0 899 $SRL $at,$a1,$t1 900 .set noreorder 901 beqz $t2,.+12 902 nop 903 break 6 # signal overflow 904 .set reorder 905 $SLL $a0,$t9 906 $SLL $a1,$t9 907 or $a0,$at 908___ 909$QT=$ta0; 910$HH=$ta1; 911$DH=$v1; 912$code.=<<___; 913.L_bn_div_words_body: 914 $SRL $DH,$a2,4*$BNSZ # bits 915 sgeu $at,$a0,$a2 916 .set noreorder 917 beqz $at,.+12 918 nop 919 $SUBU $a0,$a2 920 .set reorder 921 922 li $QT,-1 923 $SRL $HH,$a0,4*$BNSZ # bits 924 $SRL $QT,4*$BNSZ # q=0xffffffff 925 beq $DH,$HH,.L_bn_div_words_skip_div1 926 $DIVU $zero,$a0,$DH 927 mflo $QT 928.L_bn_div_words_skip_div1: 929 $MULTU $a2,$QT 930 $SLL $t3,$a0,4*$BNSZ # bits 931 $SRL $at,$a1,4*$BNSZ # bits 932 or $t3,$at 933 mflo $t0 934 mfhi $t1 935.L_bn_div_words_inner_loop1: 936 sltu $t2,$t3,$t0 937 seq $t8,$HH,$t1 938 sltu 
$at,$HH,$t1 939 and $t2,$t8 940 sltu $v0,$t0,$a2 941 or $at,$t2 942 .set noreorder 943 beqz $at,.L_bn_div_words_inner_loop1_done 944 $SUBU $t1,$v0 945 $SUBU $t0,$a2 946 b .L_bn_div_words_inner_loop1 947 $SUBU $QT,1 948 .set reorder 949.L_bn_div_words_inner_loop1_done: 950 951 $SLL $a1,4*$BNSZ # bits 952 $SUBU $a0,$t3,$t0 953 $SLL $v0,$QT,4*$BNSZ # bits 954 955 li $QT,-1 956 $SRL $HH,$a0,4*$BNSZ # bits 957 $SRL $QT,4*$BNSZ # q=0xffffffff 958 beq $DH,$HH,.L_bn_div_words_skip_div2 959 $DIVU $zero,$a0,$DH 960 mflo $QT 961.L_bn_div_words_skip_div2: 962 $MULTU $a2,$QT 963 $SLL $t3,$a0,4*$BNSZ # bits 964 $SRL $at,$a1,4*$BNSZ # bits 965 or $t3,$at 966 mflo $t0 967 mfhi $t1 968.L_bn_div_words_inner_loop2: 969 sltu $t2,$t3,$t0 970 seq $t8,$HH,$t1 971 sltu $at,$HH,$t1 972 and $t2,$t8 973 sltu $v1,$t0,$a2 974 or $at,$t2 975 .set noreorder 976 beqz $at,.L_bn_div_words_inner_loop2_done 977 $SUBU $t1,$v1 978 $SUBU $t0,$a2 979 b .L_bn_div_words_inner_loop2 980 $SUBU $QT,1 981 .set reorder 982.L_bn_div_words_inner_loop2_done: 983 984 $SUBU $a0,$t3,$t0 985 or $v0,$QT 986 $SRL $v1,$a0,$t9 # $v1 contains remainder if anybody wants it 987 $SRL $a2,$t9 # restore $a2 988 989 .set noreorder 990 move $a1,$v1 991___ 992$code.=<<___ if ($flavour =~ /nubi/i); 993 $REG_L $t3,4*$SZREG($sp) 994 $REG_L $t2,3*$SZREG($sp) 995 $REG_L $t1,2*$SZREG($sp) 996 $REG_L $t0,1*$SZREG($sp) 997 $REG_L $gp,0*$SZREG($sp) 998 $PTR_ADD $sp,6*$SZREG 999___ 1000$code.=<<___; 1001 jr $ra 1002 move $a0,$v0 1003.end bn_div_words_internal 1004___ 1005undef $HH; undef $QT; undef $DH; 1006 1007($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3); 1008($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3); 1009 1010($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1); # once we load a[7], no use for $a1 1011($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2); # once we load b[7], no use for $a2 1012 1013($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3); 1014 1015$code.=<<___; 1016 1017.align 5 1018.globl bn_mul_comba8 1019.ent bn_mul_comba8 1020bn_mul_comba8: 1021 .set 
noreorder 1022___ 1023$code.=<<___ if ($flavour =~ /nubi/i); 1024 .frame $sp,12*$SZREG,$ra 1025 .mask 0x803ff008,-$SZREG 1026 $PTR_SUB $sp,12*$SZREG 1027 $REG_S $ra,11*$SZREG($sp) 1028 $REG_S $s5,10*$SZREG($sp) 1029 $REG_S $s4,9*$SZREG($sp) 1030 $REG_S $s3,8*$SZREG($sp) 1031 $REG_S $s2,7*$SZREG($sp) 1032 $REG_S $s1,6*$SZREG($sp) 1033 $REG_S $s0,5*$SZREG($sp) 1034 $REG_S $t3,4*$SZREG($sp) 1035 $REG_S $t2,3*$SZREG($sp) 1036 $REG_S $t1,2*$SZREG($sp) 1037 $REG_S $t0,1*$SZREG($sp) 1038 $REG_S $gp,0*$SZREG($sp) 1039___ 1040$code.=<<___ if ($flavour !~ /nubi/i); 1041 .frame $sp,6*$SZREG,$ra 1042 .mask 0x003f0000,-$SZREG 1043 $PTR_SUB $sp,6*$SZREG 1044 $REG_S $s5,5*$SZREG($sp) 1045 $REG_S $s4,4*$SZREG($sp) 1046 $REG_S $s3,3*$SZREG($sp) 1047 $REG_S $s2,2*$SZREG($sp) 1048 $REG_S $s1,1*$SZREG($sp) 1049 $REG_S $s0,0*$SZREG($sp) 1050___ 1051$code.=<<___; 1052 1053 .set reorder 1054 $LD $a_0,0($a1) # If compiled with -mips3 option on 1055 # R5000 box assembler barks on this 1056 # 1ine with "should not have mult/div 1057 # as last instruction in bb (R10K 1058 # bug)" warning. If anybody out there 1059 # has a clue about how to circumvent 1060 # this do send me a note. 
1061 # <appro\@fy.chalmers.se> 1062 1063 $LD $b_0,0($a2) 1064 $LD $a_1,$BNSZ($a1) 1065 $LD $a_2,2*$BNSZ($a1) 1066 $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3); 1067 $LD $a_3,3*$BNSZ($a1) 1068 $LD $b_1,$BNSZ($a2) 1069 $LD $b_2,2*$BNSZ($a2) 1070 $LD $b_3,3*$BNSZ($a2) 1071 mflo $c_1 1072 mfhi $c_2 1073 1074 $LD $a_4,4*$BNSZ($a1) 1075 $LD $a_5,5*$BNSZ($a1) 1076 $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1); 1077 $LD $a_6,6*$BNSZ($a1) 1078 $LD $a_7,7*$BNSZ($a1) 1079 $LD $b_4,4*$BNSZ($a2) 1080 $LD $b_5,5*$BNSZ($a2) 1081 mflo $t_1 1082 mfhi $t_2 1083 $ADDU $c_2,$t_1 1084 sltu $at,$c_2,$t_1 1085 $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1); 1086 $ADDU $c_3,$t_2,$at 1087 $LD $b_6,6*$BNSZ($a2) 1088 $LD $b_7,7*$BNSZ($a2) 1089 $ST $c_1,0($a0) # r[0]=c1; 1090 mflo $t_1 1091 mfhi $t_2 1092 $ADDU $c_2,$t_1 1093 sltu $at,$c_2,$t_1 1094 $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2); 1095 $ADDU $t_2,$at 1096 $ADDU $c_3,$t_2 1097 sltu $c_1,$c_3,$t_2 1098 $ST $c_2,$BNSZ($a0) # r[1]=c2; 1099 1100 mflo $t_1 1101 mfhi $t_2 1102 $ADDU $c_3,$t_1 1103 sltu $at,$c_3,$t_1 1104 $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2); 1105 $ADDU $t_2,$at 1106 $ADDU $c_1,$t_2 1107 mflo $t_1 1108 mfhi $t_2 1109 $ADDU $c_3,$t_1 1110 sltu $at,$c_3,$t_1 1111 $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2); 1112 $ADDU $t_2,$at 1113 $ADDU $c_1,$t_2 1114 sltu $c_2,$c_1,$t_2 1115 mflo $t_1 1116 mfhi $t_2 1117 $ADDU $c_3,$t_1 1118 sltu $at,$c_3,$t_1 1119 $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3); 1120 $ADDU $t_2,$at 1121 $ADDU $c_1,$t_2 1122 sltu $at,$c_1,$t_2 1123 $ADDU $c_2,$at 1124 $ST $c_3,2*$BNSZ($a0) # r[2]=c3; 1125 1126 mflo $t_1 1127 mfhi $t_2 1128 $ADDU $c_1,$t_1 1129 sltu $at,$c_1,$t_1 1130 $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3); 1131 $ADDU $t_2,$at 1132 $ADDU $c_2,$t_2 1133 sltu $c_3,$c_2,$t_2 1134 mflo $t_1 1135 mfhi $t_2 1136 $ADDU $c_1,$t_1 1137 sltu $at,$c_1,$t_1 1138 $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3); 1139 $ADDU $t_2,$at 1140 $ADDU $c_2,$t_2 1141 
sltu $at,$c_2,$t_2 1142 $ADDU $c_3,$at 1143 mflo $t_1 1144 mfhi $t_2 1145 $ADDU $c_1,$t_1 1146 sltu $at,$c_1,$t_1 1147 $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3); 1148 $ADDU $t_2,$at 1149 $ADDU $c_2,$t_2 1150 sltu $at,$c_2,$t_2 1151 $ADDU $c_3,$at 1152 mflo $t_1 1153 mfhi $t_2 1154 $ADDU $c_1,$t_1 1155 sltu $at,$c_1,$t_1 1156 $MULTU $a_4,$b_0 # mul_add_c(a[4],b[0],c2,c3,c1); 1157 $ADDU $t_2,$at 1158 $ADDU $c_2,$t_2 1159 sltu $at,$c_2,$t_2 1160 $ADDU $c_3,$at 1161 $ST $c_1,3*$BNSZ($a0) # r[3]=c1; 1162 1163 mflo $t_1 1164 mfhi $t_2 1165 $ADDU $c_2,$t_1 1166 sltu $at,$c_2,$t_1 1167 $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1); 1168 $ADDU $t_2,$at 1169 $ADDU $c_3,$t_2 1170 sltu $c_1,$c_3,$t_2 1171 mflo $t_1 1172 mfhi $t_2 1173 $ADDU $c_2,$t_1 1174 sltu $at,$c_2,$t_1 1175 $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1); 1176 $ADDU $t_2,$at 1177 $ADDU $c_3,$t_2 1178 sltu $at,$c_3,$t_2 1179 $ADDU $c_1,$at 1180 mflo $t_1 1181 mfhi $t_2 1182 $ADDU $c_2,$t_1 1183 sltu $at,$c_2,$t_1 1184 $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1); 1185 $ADDU $t_2,$at 1186 $ADDU $c_3,$t_2 1187 sltu $at,$c_3,$t_2 1188 $ADDU $c_1,$at 1189 mflo $t_1 1190 mfhi $t_2 1191 $ADDU $c_2,$t_1 1192 sltu $at,$c_2,$t_1 1193 $MULTU $a_0,$b_4 # mul_add_c(a[0],b[4],c2,c3,c1); 1194 $ADDU $t_2,$at 1195 $ADDU $c_3,$t_2 1196 sltu $at,$c_3,$t_2 1197 $ADDU $c_1,$at 1198 mflo $t_1 1199 mfhi $t_2 1200 $ADDU $c_2,$t_1 1201 sltu $at,$c_2,$t_1 1202 $MULTU $a_0,$b_5 # mul_add_c(a[0],b[5],c3,c1,c2); 1203 $ADDU $t_2,$at 1204 $ADDU $c_3,$t_2 1205 sltu $at,$c_3,$t_2 1206 $ADDU $c_1,$at 1207 $ST $c_2,4*$BNSZ($a0) # r[4]=c2; 1208 1209 mflo $t_1 1210 mfhi $t_2 1211 $ADDU $c_3,$t_1 1212 sltu $at,$c_3,$t_1 1213 $MULTU $a_1,$b_4 # mul_add_c(a[1],b[4],c3,c1,c2); 1214 $ADDU $t_2,$at 1215 $ADDU $c_1,$t_2 1216 sltu $c_2,$c_1,$t_2 1217 mflo $t_1 1218 mfhi $t_2 1219 $ADDU $c_3,$t_1 1220 sltu $at,$c_3,$t_1 1221 $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2); 1222 $ADDU $t_2,$at 1223 $ADDU $c_1,$t_2 1224 sltu 
$at,$c_1,$t_2 1225 $ADDU $c_2,$at 1226 mflo $t_1 1227 mfhi $t_2 1228 $ADDU $c_3,$t_1 1229 sltu $at,$c_3,$t_1 1230 $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2); 1231 $ADDU $t_2,$at 1232 $ADDU $c_1,$t_2 1233 sltu $at,$c_1,$t_2 1234 $ADDU $c_2,$at 1235 mflo $t_1 1236 mfhi $t_2 1237 $ADDU $c_3,$t_1 1238 sltu $at,$c_3,$t_1 1239 $MULTU $a_4,$b_1 # mul_add_c(a[4],b[1],c3,c1,c2); 1240 $ADDU $t_2,$at 1241 $ADDU $c_1,$t_2 1242 sltu $at,$c_1,$t_2 1243 $ADDU $c_2,$at 1244 mflo $t_1 1245 mfhi $t_2 1246 $ADDU $c_3,$t_1 1247 sltu $at,$c_3,$t_1 1248 $MULTU $a_5,$b_0 # mul_add_c(a[5],b[0],c3,c1,c2); 1249 $ADDU $t_2,$at 1250 $ADDU $c_1,$t_2 1251 sltu $at,$c_1,$t_2 1252 $ADDU $c_2,$at 1253 mflo $t_1 1254 mfhi $t_2 1255 $ADDU $c_3,$t_1 1256 sltu $at,$c_3,$t_1 1257 $MULTU $a_6,$b_0 # mul_add_c(a[6],b[0],c1,c2,c3); 1258 $ADDU $t_2,$at 1259 $ADDU $c_1,$t_2 1260 sltu $at,$c_1,$t_2 1261 $ADDU $c_2,$at 1262 $ST $c_3,5*$BNSZ($a0) # r[5]=c3; 1263 1264 mflo $t_1 1265 mfhi $t_2 1266 $ADDU $c_1,$t_1 1267 sltu $at,$c_1,$t_1 1268 $MULTU $a_5,$b_1 # mul_add_c(a[5],b[1],c1,c2,c3); 1269 $ADDU $t_2,$at 1270 $ADDU $c_2,$t_2 1271 sltu $c_3,$c_2,$t_2 1272 mflo $t_1 1273 mfhi $t_2 1274 $ADDU $c_1,$t_1 1275 sltu $at,$c_1,$t_1 1276 $MULTU $a_4,$b_2 # mul_add_c(a[4],b[2],c1,c2,c3); 1277 $ADDU $t_2,$at 1278 $ADDU $c_2,$t_2 1279 sltu $at,$c_2,$t_2 1280 $ADDU $c_3,$at 1281 mflo $t_1 1282 mfhi $t_2 1283 $ADDU $c_1,$t_1 1284 sltu $at,$c_1,$t_1 1285 $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3); 1286 $ADDU $t_2,$at 1287 $ADDU $c_2,$t_2 1288 sltu $at,$c_2,$t_2 1289 $ADDU $c_3,$at 1290 mflo $t_1 1291 mfhi $t_2 1292 $ADDU $c_1,$t_1 1293 sltu $at,$c_1,$t_1 1294 $MULTU $a_2,$b_4 # mul_add_c(a[2],b[4],c1,c2,c3); 1295 $ADDU $t_2,$at 1296 $ADDU $c_2,$t_2 1297 sltu $at,$c_2,$t_2 1298 $ADDU $c_3,$at 1299 mflo $t_1 1300 mfhi $t_2 1301 $ADDU $c_1,$t_1 1302 sltu $at,$c_1,$t_1 1303 $MULTU $a_1,$b_5 # mul_add_c(a[1],b[5],c1,c2,c3); 1304 $ADDU $t_2,$at 1305 $ADDU $c_2,$t_2 1306 sltu $at,$c_2,$t_2 1307 $ADDU $c_3,$at 1308 
mflo $t_1 1309 mfhi $t_2 1310 $ADDU $c_1,$t_1 1311 sltu $at,$c_1,$t_1 1312 $MULTU $a_0,$b_6 # mul_add_c(a[0],b[6],c1,c2,c3); 1313 $ADDU $t_2,$at 1314 $ADDU $c_2,$t_2 1315 sltu $at,$c_2,$t_2 1316 $ADDU $c_3,$at 1317 mflo $t_1 1318 mfhi $t_2 1319 $ADDU $c_1,$t_1 1320 sltu $at,$c_1,$t_1 1321 $MULTU $a_0,$b_7 # mul_add_c(a[0],b[7],c2,c3,c1); 1322 $ADDU $t_2,$at 1323 $ADDU $c_2,$t_2 1324 sltu $at,$c_2,$t_2 1325 $ADDU $c_3,$at 1326 $ST $c_1,6*$BNSZ($a0) # r[6]=c1; 1327 1328 mflo $t_1 1329 mfhi $t_2 1330 $ADDU $c_2,$t_1 1331 sltu $at,$c_2,$t_1 1332 $MULTU $a_1,$b_6 # mul_add_c(a[1],b[6],c2,c3,c1); 1333 $ADDU $t_2,$at 1334 $ADDU $c_3,$t_2 1335 sltu $c_1,$c_3,$t_2 1336 mflo $t_1 1337 mfhi $t_2 1338 $ADDU $c_2,$t_1 1339 sltu $at,$c_2,$t_1 1340 $MULTU $a_2,$b_5 # mul_add_c(a[2],b[5],c2,c3,c1); 1341 $ADDU $t_2,$at 1342 $ADDU $c_3,$t_2 1343 sltu $at,$c_3,$t_2 1344 $ADDU $c_1,$at 1345 mflo $t_1 1346 mfhi $t_2 1347 $ADDU $c_2,$t_1 1348 sltu $at,$c_2,$t_1 1349 $MULTU $a_3,$b_4 # mul_add_c(a[3],b[4],c2,c3,c1); 1350 $ADDU $t_2,$at 1351 $ADDU $c_3,$t_2 1352 sltu $at,$c_3,$t_2 1353 $ADDU $c_1,$at 1354 mflo $t_1 1355 mfhi $t_2 1356 $ADDU $c_2,$t_1 1357 sltu $at,$c_2,$t_1 1358 $MULTU $a_4,$b_3 # mul_add_c(a[4],b[3],c2,c3,c1); 1359 $ADDU $t_2,$at 1360 $ADDU $c_3,$t_2 1361 sltu $at,$c_3,$t_2 1362 $ADDU $c_1,$at 1363 mflo $t_1 1364 mfhi $t_2 1365 $ADDU $c_2,$t_1 1366 sltu $at,$c_2,$t_1 1367 $MULTU $a_5,$b_2 # mul_add_c(a[5],b[2],c2,c3,c1); 1368 $ADDU $t_2,$at 1369 $ADDU $c_3,$t_2 1370 sltu $at,$c_3,$t_2 1371 $ADDU $c_1,$at 1372 mflo $t_1 1373 mfhi $t_2 1374 $ADDU $c_2,$t_1 1375 sltu $at,$c_2,$t_1 1376 $MULTU $a_6,$b_1 # mul_add_c(a[6],b[1],c2,c3,c1); 1377 $ADDU $t_2,$at 1378 $ADDU $c_3,$t_2 1379 sltu $at,$c_3,$t_2 1380 $ADDU $c_1,$at 1381 mflo $t_1 1382 mfhi $t_2 1383 $ADDU $c_2,$t_1 1384 sltu $at,$c_2,$t_1 1385 $MULTU $a_7,$b_0 # mul_add_c(a[7],b[0],c2,c3,c1); 1386 $ADDU $t_2,$at 1387 $ADDU $c_3,$t_2 1388 sltu $at,$c_3,$t_2 1389 $ADDU $c_1,$at 1390 mflo $t_1 1391 mfhi $t_2 1392 $ADDU 
$c_2,$t_1 1393 sltu $at,$c_2,$t_1 1394 $MULTU $a_7,$b_1 # mul_add_c(a[7],b[1],c3,c1,c2); 1395 $ADDU $t_2,$at 1396 $ADDU $c_3,$t_2 1397 sltu $at,$c_3,$t_2 1398 $ADDU $c_1,$at 1399 $ST $c_2,7*$BNSZ($a0) # r[7]=c2; 1400 1401 mflo $t_1 1402 mfhi $t_2 1403 $ADDU $c_3,$t_1 1404 sltu $at,$c_3,$t_1 1405 $MULTU $a_6,$b_2 # mul_add_c(a[6],b[2],c3,c1,c2); 1406 $ADDU $t_2,$at 1407 $ADDU $c_1,$t_2 1408 sltu $c_2,$c_1,$t_2 1409 mflo $t_1 1410 mfhi $t_2 1411 $ADDU $c_3,$t_1 1412 sltu $at,$c_3,$t_1 1413 $MULTU $a_5,$b_3 # mul_add_c(a[5],b[3],c3,c1,c2); 1414 $ADDU $t_2,$at 1415 $ADDU $c_1,$t_2 1416 sltu $at,$c_1,$t_2 1417 $ADDU $c_2,$at 1418 mflo $t_1 1419 mfhi $t_2 1420 $ADDU $c_3,$t_1 1421 sltu $at,$c_3,$t_1 1422 $MULTU $a_4,$b_4 # mul_add_c(a[4],b[4],c3,c1,c2); 1423 $ADDU $t_2,$at 1424 $ADDU $c_1,$t_2 1425 sltu $at,$c_1,$t_2 1426 $ADDU $c_2,$at 1427 mflo $t_1 1428 mfhi $t_2 1429 $ADDU $c_3,$t_1 1430 sltu $at,$c_3,$t_1 1431 $MULTU $a_3,$b_5 # mul_add_c(a[3],b[5],c3,c1,c2); 1432 $ADDU $t_2,$at 1433 $ADDU $c_1,$t_2 1434 sltu $at,$c_1,$t_2 1435 $ADDU $c_2,$at 1436 mflo $t_1 1437 mfhi $t_2 1438 $ADDU $c_3,$t_1 1439 sltu $at,$c_3,$t_1 1440 $MULTU $a_2,$b_6 # mul_add_c(a[2],b[6],c3,c1,c2); 1441 $ADDU $t_2,$at 1442 $ADDU $c_1,$t_2 1443 sltu $at,$c_1,$t_2 1444 $ADDU $c_2,$at 1445 mflo $t_1 1446 mfhi $t_2 1447 $ADDU $c_3,$t_1 1448 sltu $at,$c_3,$t_1 1449 $MULTU $a_1,$b_7 # mul_add_c(a[1],b[7],c3,c1,c2); 1450 $ADDU $t_2,$at 1451 $ADDU $c_1,$t_2 1452 sltu $at,$c_1,$t_2 1453 $ADDU $c_2,$at 1454 mflo $t_1 1455 mfhi $t_2 1456 $ADDU $c_3,$t_1 1457 sltu $at,$c_3,$t_1 1458 $MULTU $a_2,$b_7 # mul_add_c(a[2],b[7],c1,c2,c3); 1459 $ADDU $t_2,$at 1460 $ADDU $c_1,$t_2 1461 sltu $at,$c_1,$t_2 1462 $ADDU $c_2,$at 1463 $ST $c_3,8*$BNSZ($a0) # r[8]=c3; 1464 1465 mflo $t_1 1466 mfhi $t_2 1467 $ADDU $c_1,$t_1 1468 sltu $at,$c_1,$t_1 1469 $MULTU $a_3,$b_6 # mul_add_c(a[3],b[6],c1,c2,c3); 1470 $ADDU $t_2,$at 1471 $ADDU $c_2,$t_2 1472 sltu $c_3,$c_2,$t_2 1473 mflo $t_1 1474 mfhi $t_2 1475 $ADDU $c_1,$t_1 1476 
sltu $at,$c_1,$t_1 1477 $MULTU $a_4,$b_5 # mul_add_c(a[4],b[5],c1,c2,c3); 1478 $ADDU $t_2,$at 1479 $ADDU $c_2,$t_2 1480 sltu $at,$c_2,$t_2 1481 $ADDU $c_3,$at 1482 mflo $t_1 1483 mfhi $t_2 1484 $ADDU $c_1,$t_1 1485 sltu $at,$c_1,$t_1 1486 $MULTU $a_5,$b_4 # mul_add_c(a[5],b[4],c1,c2,c3); 1487 $ADDU $t_2,$at 1488 $ADDU $c_2,$t_2 1489 sltu $at,$c_2,$t_2 1490 $ADDU $c_3,$at 1491 mflo $t_1 1492 mfhi $t_2 1493 $ADDU $c_1,$t_1 1494 sltu $at,$c_1,$t_1 1495 $MULTU $a_6,$b_3 # mul_add_c(a[6],b[3],c1,c2,c3); 1496 $ADDU $t_2,$at 1497 $ADDU $c_2,$t_2 1498 sltu $at,$c_2,$t_2 1499 $ADDU $c_3,$at 1500 mflo $t_1 1501 mfhi $t_2 1502 $ADDU $c_1,$t_1 1503 sltu $at,$c_1,$t_1 1504 $MULTU $a_7,$b_2 # mul_add_c(a[7],b[2],c1,c2,c3); 1505 $ADDU $t_2,$at 1506 $ADDU $c_2,$t_2 1507 sltu $at,$c_2,$t_2 1508 $ADDU $c_3,$at 1509 mflo $t_1 1510 mfhi $t_2 1511 $ADDU $c_1,$t_1 1512 sltu $at,$c_1,$t_1 1513 $MULTU $a_7,$b_3 # mul_add_c(a[7],b[3],c2,c3,c1); 1514 $ADDU $t_2,$at 1515 $ADDU $c_2,$t_2 1516 sltu $at,$c_2,$t_2 1517 $ADDU $c_3,$at 1518 $ST $c_1,9*$BNSZ($a0) # r[9]=c1; 1519 1520 mflo $t_1 1521 mfhi $t_2 1522 $ADDU $c_2,$t_1 1523 sltu $at,$c_2,$t_1 1524 $MULTU $a_6,$b_4 # mul_add_c(a[6],b[4],c2,c3,c1); 1525 $ADDU $t_2,$at 1526 $ADDU $c_3,$t_2 1527 sltu $c_1,$c_3,$t_2 1528 mflo $t_1 1529 mfhi $t_2 1530 $ADDU $c_2,$t_1 1531 sltu $at,$c_2,$t_1 1532 $MULTU $a_5,$b_5 # mul_add_c(a[5],b[5],c2,c3,c1); 1533 $ADDU $t_2,$at 1534 $ADDU $c_3,$t_2 1535 sltu $at,$c_3,$t_2 1536 $ADDU $c_1,$at 1537 mflo $t_1 1538 mfhi $t_2 1539 $ADDU $c_2,$t_1 1540 sltu $at,$c_2,$t_1 1541 $MULTU $a_4,$b_6 # mul_add_c(a[4],b[6],c2,c3,c1); 1542 $ADDU $t_2,$at 1543 $ADDU $c_3,$t_2 1544 sltu $at,$c_3,$t_2 1545 $ADDU $c_1,$at 1546 mflo $t_1 1547 mfhi $t_2 1548 $ADDU $c_2,$t_1 1549 sltu $at,$c_2,$t_1 1550 $MULTU $a_3,$b_7 # mul_add_c(a[3],b[7],c2,c3,c1); 1551 $ADDU $t_2,$at 1552 $ADDU $c_3,$t_2 1553 sltu $at,$c_3,$t_2 1554 $ADDU $c_1,$at 1555 mflo $t_1 1556 mfhi $t_2 1557 $ADDU $c_2,$t_1 1558 sltu $at,$c_2,$t_1 1559 $MULTU $a_4,$b_7 
# mul_add_c(a[4],b[7],c3,c1,c2); 1560 $ADDU $t_2,$at 1561 $ADDU $c_3,$t_2 1562 sltu $at,$c_3,$t_2 1563 $ADDU $c_1,$at 1564 $ST $c_2,10*$BNSZ($a0) # r[10]=c2; 1565 1566 mflo $t_1 1567 mfhi $t_2 1568 $ADDU $c_3,$t_1 1569 sltu $at,$c_3,$t_1 1570 $MULTU $a_5,$b_6 # mul_add_c(a[5],b[6],c3,c1,c2); 1571 $ADDU $t_2,$at 1572 $ADDU $c_1,$t_2 1573 sltu $c_2,$c_1,$t_2 1574 mflo $t_1 1575 mfhi $t_2 1576 $ADDU $c_3,$t_1 1577 sltu $at,$c_3,$t_1 1578 $MULTU $a_6,$b_5 # mul_add_c(a[6],b[5],c3,c1,c2); 1579 $ADDU $t_2,$at 1580 $ADDU $c_1,$t_2 1581 sltu $at,$c_1,$t_2 1582 $ADDU $c_2,$at 1583 mflo $t_1 1584 mfhi $t_2 1585 $ADDU $c_3,$t_1 1586 sltu $at,$c_3,$t_1 1587 $MULTU $a_7,$b_4 # mul_add_c(a[7],b[4],c3,c1,c2); 1588 $ADDU $t_2,$at 1589 $ADDU $c_1,$t_2 1590 sltu $at,$c_1,$t_2 1591 $ADDU $c_2,$at 1592 mflo $t_1 1593 mfhi $t_2 1594 $ADDU $c_3,$t_1 1595 sltu $at,$c_3,$t_1 1596 $MULTU $a_7,$b_5 # mul_add_c(a[7],b[5],c1,c2,c3); 1597 $ADDU $t_2,$at 1598 $ADDU $c_1,$t_2 1599 sltu $at,$c_1,$t_2 1600 $ADDU $c_2,$at 1601 $ST $c_3,11*$BNSZ($a0) # r[11]=c3; 1602 1603 mflo $t_1 1604 mfhi $t_2 1605 $ADDU $c_1,$t_1 1606 sltu $at,$c_1,$t_1 1607 $MULTU $a_6,$b_6 # mul_add_c(a[6],b[6],c1,c2,c3); 1608 $ADDU $t_2,$at 1609 $ADDU $c_2,$t_2 1610 sltu $c_3,$c_2,$t_2 1611 mflo $t_1 1612 mfhi $t_2 1613 $ADDU $c_1,$t_1 1614 sltu $at,$c_1,$t_1 1615 $MULTU $a_5,$b_7 # mul_add_c(a[5],b[7],c1,c2,c3); 1616 $ADDU $t_2,$at 1617 $ADDU $c_2,$t_2 1618 sltu $at,$c_2,$t_2 1619 $ADDU $c_3,$at 1620 mflo $t_1 1621 mfhi $t_2 1622 $ADDU $c_1,$t_1 1623 sltu $at,$c_1,$t_1 1624 $MULTU $a_6,$b_7 # mul_add_c(a[6],b[7],c2,c3,c1); 1625 $ADDU $t_2,$at 1626 $ADDU $c_2,$t_2 1627 sltu $at,$c_2,$t_2 1628 $ADDU $c_3,$at 1629 $ST $c_1,12*$BNSZ($a0) # r[12]=c1; 1630 1631 mflo $t_1 1632 mfhi $t_2 1633 $ADDU $c_2,$t_1 1634 sltu $at,$c_2,$t_1 1635 $MULTU $a_7,$b_6 # mul_add_c(a[7],b[6],c2,c3,c1); 1636 $ADDU $t_2,$at 1637 $ADDU $c_3,$t_2 1638 sltu $c_1,$c_3,$t_2 1639 mflo $t_1 1640 mfhi $t_2 1641 $ADDU $c_2,$t_1 1642 sltu $at,$c_2,$t_1 1643 
$MULTU	$a_7,$b_7		# mul_add_c(a[7],b[7],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,13*$BNSZ($a0)	# r[13]=c2;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	$ST	$c_3,14*$BNSZ($a0)	# r[14]=c3;
	$ST	$c_1,15*$BNSZ($a0)	# r[15]=c1;

	.set	noreorder
___
# bn_mul_comba8 epilogue: restore callee-saved registers and return.
# NUBI saved $t0..$t3 and $gp in addition to $s0..$s5 in the prologue,
# hence the two restore sequences below (frame sizes differ: 12 vs 6 slots).
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$s5,10*$SZREG($sp)
	$REG_L	$s4,9*$SZREG($sp)
	$REG_L	$s3,8*$SZREG($sp)
	$REG_L	$s2,7*$SZREG($sp)
	$REG_L	$s1,6*$SZREG($sp)
	$REG_L	$s0,5*$SZREG($sp)
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	jr	$ra
	$PTR_ADD $sp,12*$SZREG
___
# Non-NUBI (o32/n32/n64) epilogue: only $s0..$s5 were saved.
$code.=<<___ if ($flavour !~ /nubi/i);
	$REG_L	$s5,5*$SZREG($sp)
	$REG_L	$s4,4*$SZREG($sp)
	$REG_L	$s3,3*$SZREG($sp)
	$REG_L	$s2,2*$SZREG($sp)
	$REG_L	$s1,1*$SZREG($sp)
	$REG_L	$s0,0*$SZREG($sp)
	jr	$ra
	$PTR_ADD $sp,6*$SZREG
___
# bn_mul_comba4(r,a,b): 4x4->8-word comba multiplication.  Same
# carry-propagation scheme as comba8 above, using only $a_0..$a_3 and
# $b_0..$b_3, so no $s-registers need saving in the non-NUBI case.
$code.=<<___;
.end	bn_mul_comba8

.align	5
.globl	bn_mul_comba4
.ent	bn_mul_comba4
bn_mul_comba4:
___
# NUBI prologue: save $ra, $t0..$t3 and $gp per the NUBI calling convention.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# Body: loads are interleaved with the first multiply so they execute in
# the multiplier's shadow; thereafter each $MULTU is issued before the
# previous product's carry bookkeeping to hide multiplier latency.
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$b_0,0($a2)
	$LD	$a_1,$BNSZ($a1)
	$LD	$a_2,2*$BNSZ($a1)
	$MULTU	$a_0,$b_0		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_3,3*$BNSZ($a1)
	$LD	$b_1,$BNSZ($a2)
	$LD	$b_2,2*$BNSZ($a2)
	$LD	$b_3,3*$BNSZ($a2)
	mflo	$c_1
	mfhi	$c_2
	$ST	$c_1,0($a0)

	$MULTU	$a_0,$b_1		# mul_add_c(a[0],b[1],c2,c3,c1);
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_1,$b_0		# mul_add_c(a[1],b[0],c2,c3,c1);
	$ADDU	$c_3,$t_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_2,$b_0		# mul_add_c(a[2],b[0],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	$ST	$c_2,$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_1,$b_1		# mul_add_c(a[1],b[1],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_0,$b_2		# mul_add_c(a[0],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_0,$b_3		# mul_add_c(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_1,$b_2		# mul_add_c(a[1],b[2],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_2,$b_1		# mul_add_c(a[2],b[1],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_3,$b_0		# mul_add_c(a[3],b[0],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_3,$b_1		# mul_add_c(a[3],b[1],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,3*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_2,$b_2		# mul_add_c(a[2],b[2],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_1,$b_3		# mul_add_c(a[1],b[3],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_2,$b_3		# mul_add_c(a[2],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_3,$b_2		# mul_add_c(a[3],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_3,$b_3		# mul_add_c(a[3],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,5*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	$ST	$c_1,6*$BNSZ($a0)
	$ST	$c_2,7*$BNSZ($a0)

	.set	noreorder
___
# bn_mul_comba4 epilogue (NUBI restores its extra saved registers).
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	nop
.end	bn_mul_comba4
___

# Squaring takes a single input vector, so the registers that held
# b[0..3] in the multiplication routines are re-aliased to a[4..7] here.
($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);

# bn_sqr_comba8(r,a): 8-word comba squaring.  Off-diagonal products are
# doubled (the mul_add_c2 steps): the slt-against-$zero captures the top
# bit about to be shifted out by $SLL ...,1 so no carry is lost.
$code.=<<___;

.align	5
.globl	bn_sqr_comba8
.ent	bn_sqr_comba8
bn_sqr_comba8:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$a_1,$BNSZ($a1)
	$LD	$a_2,2*$BNSZ($a1)
	$LD	$a_3,3*$BNSZ($a1)

	$MULTU	$a_0,$a_0		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_4,4*$BNSZ($a1)
	$LD	$a_5,5*$BNSZ($a1)
	$LD	$a_6,6*$BNSZ($a1)
	$LD	$a_7,7*$BNSZ($a1)
	mflo	$c_1
	mfhi	$c_2
	$ST	$c_1,0($a0)

	$MULTU	$a_0,$a_1		# mul_add_c2(a[0],b[1],c2,c3,c1);
	mflo	$t_1
	mfhi	$t_2
	slt	$c_1,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_2,$a_0		# mul_add_c2(a[2],b[0],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$c_3,$t_2,$at
	$ST	$c_2,$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_2,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_1,$a_1		# mul_add_c(a[1],b[1],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_0,$a_3		# mul_add_c2(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_3,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_1,$a_2		# mul_add_c2(a[1],b[2],c1,c2,c3);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_3,$at
	$MULTU	$a_4,$a_0		# mul_add_c2(a[4],b[0],c2,c3,c1);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,3*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_1,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_3,$a_1		# mul_add_c2(a[3],b[1],c2,c3,c1);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_1,$at
	$MULTU	$a_2,$a_2		# mul_add_c(a[2],b[2],c2,c3,c1);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_0,$a_5		# mul_add_c2(a[0],b[5],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_2,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_1,$a_4		# mul_add_c2(a[1],b[4],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_2,$at
	$MULTU	$a_2,$a_3		# mul_add_c2(a[2],b[3],c3,c1,c2);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$MULTU	$a_6,$a_0		# mul_add_c2(a[6],b[0],c1,c2,c3);
	$ADDU	$c_2,$at
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,5*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_3,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_5,$a_1		# mul_add_c2(a[5],b[1],c1,c2,c3);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_3,$at
	$MULTU	$a_4,$a_2		# mul_add_c2(a[4],b[2],c1,c2,c3);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_3,$at
	$MULTU	$a_3,$a_3		# mul_add_c(a[3],b[3],c1,c2,c3);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_0,$a_7		# mul_add_c2(a[0],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,6*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_1,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_1,$a_6		# mul_add_c2(a[1],b[6],c2,c3,c1);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_1,$at
	$MULTU	$a_2,$a_5		# mul_add_c2(a[2],b[5],c2,c3,c1);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_1,$at
	$MULTU	$a_3,$a_4		# mul_add_c2(a[3],b[4],c2,c3,c1);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_1,$at
	$MULTU	$a_7,$a_1		# mul_add_c2(a[7],b[1],c3,c1,c2);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,7*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_2,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_6,$a_2		# mul_add_c2(a[6],b[2],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_2,$at
	$MULTU	$a_5,$a_3		# mul_add_c2(a[5],b[3],c3,c1,c2);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_2,$at
	$MULTU	$a_4,$a_4		# mul_add_c(a[4],b[4],c3,c1,c2);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_2,$a_7		# mul_add_c2(a[2],b[7],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,8*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_3,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_3,$a_6		# mul_add_c2(a[3],b[6],c1,c2,c3);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_3,$at
	$MULTU	$a_4,$a_5		# mul_add_c2(a[4],b[5],c1,c2,c3);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_3,$at
	$MULTU	$a_7,$a_3		# mul_add_c2(a[7],b[3],c2,c3,c1);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,9*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_1,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_6,$a_4		# mul_add_c2(a[6],b[4],c2,c3,c1);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_1,$at
	$MULTU	$a_5,$a_5		# mul_add_c(a[5],b[5],c2,c3,c1);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_4,$a_7		# mul_add_c2(a[4],b[7],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,10*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_2,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_5,$a_6		# mul_add_c2(a[5],b[6],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_2,$at
	$MULTU	$a_7,$a_5		# mul_add_c2(a[7],b[5],c1,c2,c3);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,11*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_3,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_6,$a_6		# mul_add_c(a[6],b[6],c1,c2,c3);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_6,$a_7		# mul_add_c2(a[6],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,12*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_1,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_7,$a_7		# mul_add_c(a[7],b[7],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,13*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	$ST	$c_3,14*$BNSZ($a0)
	$ST	$c_1,15*$BNSZ($a0)

	.set	noreorder
___
# bn_sqr_comba8 epilogue (NUBI variant restores its extra registers).
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# bn_sqr_comba4(r,a): 4-word comba squaring, same doubling scheme.
$code.=<<___;
	jr	$ra
	nop
.end	bn_sqr_comba8

.align	5
.globl	bn_sqr_comba4
.ent	bn_sqr_comba4
bn_sqr_comba4:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$a_1,$BNSZ($a1)
	$MULTU	$a_0,$a_0		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_2,2*$BNSZ($a1)
	$LD	$a_3,3*$BNSZ($a1)
	mflo	$c_1
	mfhi	$c_2
	$ST	$c_1,0($a0)

	$MULTU	$a_0,$a_1		# mul_add_c2(a[0],b[1],c2,c3,c1);
	mflo	$t_1
	mfhi	$t_2
	slt	$c_1,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_2,$a_0		# mul_add_c2(a[2],b[0],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$c_3,$t_2,$at
	$ST	$c_2,$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_2,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_1,$a_1		# mul_add_c(a[1],b[1],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_0,$a_3		# mul_add_c2(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_3,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_1,$a_2		# mul_add_c(a2[1],b[2],c1,c2,c3);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_3,$at
	$MULTU	$a_3,$a_1		# mul_add_c2(a[3],b[1],c2,c3,c1);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,3*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_1,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_2,$a_2		# mul_add_c(a[2],b[2],c2,c3,c1);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_2,$a_3		# mul_add_c2(a[2],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_2,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_3,$a_3		# mul_add_c(a[3],b[3],c1,c2,c3);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,5*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	$ST	$c_1,6*$BNSZ($a0)
	$ST	$c_2,7*$BNSZ($a0)

	.set	noreorder
___
# bn_sqr_comba4 epilogue (NUBI variant).
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	nop
.end	bn_sqr_comba4
___
# Emit the accumulated assembly (STDOUT was reopened onto $output above)
# and flush/close it so the caller sees a complete file.
print $code;
close STDOUT;