#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project.
#
# Rights for redistribution and usage in source and binary forms are
# granted according to the OpenSSL license. Warranty of any kind is
# disclaimed.
# ====================================================================


# July 1999
#
# This is a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c.
#
# The module is designed to work with either of the "new" MIPS ABI(5),
# namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
# IRIX 5.x not only because it doesn't support new ABIs but also
# because 5.x kernels put R4x00 CPU into 32-bit mode and all those
# 64-bit instructions (daddu, dmultu, etc.) found below gonna only
# cause illegal instruction exception:-(
#
# In addition the code depends on preprocessor flags set up by MIPSpro
# compiler driver (either as or cc) and therefore (probably?) can't be
# compiled by the GNU assembler. GNU C driver manages fine though...
# I mean as long as -mmips-as is specified or is the default option,
# because then it simply invokes /usr/bin/as which in turn takes
# perfect care of the preprocessor definitions. Another neat feature
# offered by the MIPSpro assembler is an optimization pass. This gave
# me the opportunity to have the code looking more regular as all those
# architecture dependent instruction rescheduling details were left to
# the assembler. Cool, huh?
34238384Sjkim# 35238384Sjkim# Performance improvement is astonishing! 'apps/openssl speed rsa dsa' 36238384Sjkim# goes way over 3 times faster! 37238384Sjkim# 38238384Sjkim# <appro@fy.chalmers.se> 39238384Sjkim 40238384Sjkim# October 2010 41238384Sjkim# 42238384Sjkim# Adapt the module even for 32-bit ABIs and other OSes. The former was 43238384Sjkim# achieved by mechanical replacement of 64-bit arithmetic instructions 44238384Sjkim# such as dmultu, daddu, etc. with their 32-bit counterparts and 45238384Sjkim# adjusting offsets denoting multiples of BN_ULONG. Above mentioned 46238384Sjkim# >3x performance improvement naturally does not apply to 32-bit code 47238384Sjkim# [because there is no instruction 32-bit compiler can't use], one 48238384Sjkim# has to content with 40-85% improvement depending on benchmark and 49238384Sjkim# key length, more for longer keys. 50238384Sjkim 51238384Sjkim$flavour = shift; 52238384Sjkimwhile (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} 53238384Sjkimopen STDOUT,">$output"; 54238384Sjkim 55238384Sjkimif ($flavour =~ /64|n32/i) { 56238384Sjkim $LD="ld"; 57238384Sjkim $ST="sd"; 58238384Sjkim $MULTU="dmultu"; 59238384Sjkim $DIVU="ddivu"; 60238384Sjkim $ADDU="daddu"; 61238384Sjkim $SUBU="dsubu"; 62238384Sjkim $SRL="dsrl"; 63238384Sjkim $SLL="dsll"; 64238384Sjkim $BNSZ=8; 65238384Sjkim $PTR_ADD="daddu"; 66238384Sjkim $PTR_SUB="dsubu"; 67238384Sjkim $SZREG=8; 68238384Sjkim $REG_S="sd"; 69238384Sjkim $REG_L="ld"; 70238384Sjkim} else { 71238384Sjkim $LD="lw"; 72238384Sjkim $ST="sw"; 73238384Sjkim $MULTU="multu"; 74238384Sjkim $DIVU="divu"; 75238384Sjkim $ADDU="addu"; 76238384Sjkim $SUBU="subu"; 77238384Sjkim $SRL="srl"; 78238384Sjkim $SLL="sll"; 79238384Sjkim $BNSZ=4; 80238384Sjkim $PTR_ADD="addu"; 81238384Sjkim $PTR_SUB="subu"; 82238384Sjkim $SZREG=4; 83238384Sjkim $REG_S="sw"; 84238384Sjkim $REG_L="lw"; 85238384Sjkim $code=".set mips2\n"; 86238384Sjkim} 87238384Sjkim 88238384Sjkim# Below is N32/64 register layout used in the 
original module. 89238384Sjkim# 90238384Sjkim($zero,$at,$v0,$v1)=map("\$$_",(0..3)); 91238384Sjkim($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); 92238384Sjkim($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); 93238384Sjkim($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); 94238384Sjkim($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); 95238384Sjkim($ta0,$ta1,$ta2,$ta3)=($a4,$a5,$a6,$a7); 96238384Sjkim# 97238384Sjkim# No special adaptation is required for O32. NUBI on the other hand 98238384Sjkim# is treated by saving/restoring ($v1,$t0..$t3). 99238384Sjkim 100238384Sjkim$gp=$v1 if ($flavour =~ /nubi/i); 101238384Sjkim 102238384Sjkim$minus4=$v1; 103238384Sjkim 104238384Sjkim$code.=<<___; 105238384Sjkim.rdata 106238384Sjkim.asciiz "mips3.s, Version 1.2" 107238384Sjkim.asciiz "MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>" 108238384Sjkim 109238384Sjkim.text 110238384Sjkim.set noat 111238384Sjkim 112238384Sjkim.align 5 113238384Sjkim.globl bn_mul_add_words 114238384Sjkim.ent bn_mul_add_words 115238384Sjkimbn_mul_add_words: 116238384Sjkim .set noreorder 117238384Sjkim bgtz $a2,bn_mul_add_words_internal 118238384Sjkim move $v0,$zero 119238384Sjkim jr $ra 120238384Sjkim move $a0,$v0 121238384Sjkim.end bn_mul_add_words 122238384Sjkim 123238384Sjkim.align 5 124238384Sjkim.ent bn_mul_add_words_internal 125238384Sjkimbn_mul_add_words_internal: 126238384Sjkim___ 127238384Sjkim$code.=<<___ if ($flavour =~ /nubi/i); 128238384Sjkim .frame $sp,6*$SZREG,$ra 129238384Sjkim .mask 0x8000f008,-$SZREG 130238384Sjkim .set noreorder 131238384Sjkim $PTR_SUB $sp,6*$SZREG 132238384Sjkim $REG_S $ra,5*$SZREG($sp) 133238384Sjkim $REG_S $t3,4*$SZREG($sp) 134238384Sjkim $REG_S $t2,3*$SZREG($sp) 135238384Sjkim $REG_S $t1,2*$SZREG($sp) 136238384Sjkim $REG_S $t0,1*$SZREG($sp) 137238384Sjkim $REG_S $gp,0*$SZREG($sp) 138238384Sjkim___ 139238384Sjkim$code.=<<___; 140238384Sjkim .set reorder 141238384Sjkim li $minus4,-4 142238384Sjkim and $ta0,$a2,$minus4 143238384Sjkim beqz 
$ta0,.L_bn_mul_add_words_tail 144238384Sjkim 145238384Sjkim.L_bn_mul_add_words_loop: 146264331Sjkim $LD $t0,0($a1) 147238384Sjkim $MULTU $t0,$a3 148238384Sjkim $LD $t1,0($a0) 149238384Sjkim $LD $t2,$BNSZ($a1) 150238384Sjkim $LD $t3,$BNSZ($a0) 151238384Sjkim $LD $ta0,2*$BNSZ($a1) 152238384Sjkim $LD $ta1,2*$BNSZ($a0) 153238384Sjkim $ADDU $t1,$v0 154238384Sjkim sltu $v0,$t1,$v0 # All manuals say it "compares 32-bit 155238384Sjkim # values", but it seems to work fine 156238384Sjkim # even on 64-bit registers. 157238384Sjkim mflo $at 158238384Sjkim mfhi $t0 159238384Sjkim $ADDU $t1,$at 160238384Sjkim $ADDU $v0,$t0 161238384Sjkim $MULTU $t2,$a3 162238384Sjkim sltu $at,$t1,$at 163238384Sjkim $ST $t1,0($a0) 164238384Sjkim $ADDU $v0,$at 165238384Sjkim 166238384Sjkim $LD $ta2,3*$BNSZ($a1) 167238384Sjkim $LD $ta3,3*$BNSZ($a0) 168238384Sjkim $ADDU $t3,$v0 169238384Sjkim sltu $v0,$t3,$v0 170238384Sjkim mflo $at 171238384Sjkim mfhi $t2 172238384Sjkim $ADDU $t3,$at 173238384Sjkim $ADDU $v0,$t2 174238384Sjkim $MULTU $ta0,$a3 175238384Sjkim sltu $at,$t3,$at 176238384Sjkim $ST $t3,$BNSZ($a0) 177238384Sjkim $ADDU $v0,$at 178238384Sjkim 179238384Sjkim subu $a2,4 180238384Sjkim $PTR_ADD $a0,4*$BNSZ 181238384Sjkim $PTR_ADD $a1,4*$BNSZ 182238384Sjkim $ADDU $ta1,$v0 183238384Sjkim sltu $v0,$ta1,$v0 184238384Sjkim mflo $at 185238384Sjkim mfhi $ta0 186238384Sjkim $ADDU $ta1,$at 187238384Sjkim $ADDU $v0,$ta0 188238384Sjkim $MULTU $ta2,$a3 189238384Sjkim sltu $at,$ta1,$at 190238384Sjkim $ST $ta1,-2*$BNSZ($a0) 191238384Sjkim $ADDU $v0,$at 192238384Sjkim 193238384Sjkim 194238384Sjkim and $ta0,$a2,$minus4 195238384Sjkim $ADDU $ta3,$v0 196238384Sjkim sltu $v0,$ta3,$v0 197238384Sjkim mflo $at 198238384Sjkim mfhi $ta2 199238384Sjkim $ADDU $ta3,$at 200238384Sjkim $ADDU $v0,$ta2 201238384Sjkim sltu $at,$ta3,$at 202238384Sjkim $ST $ta3,-$BNSZ($a0) 203264331Sjkim .set noreorder 204264331Sjkim bgtz $ta0,.L_bn_mul_add_words_loop 205238384Sjkim $ADDU $v0,$at 206238384Sjkim 207238384Sjkim beqz 
$a2,.L_bn_mul_add_words_return 208238384Sjkim nop 209238384Sjkim 210238384Sjkim.L_bn_mul_add_words_tail: 211238384Sjkim .set reorder 212238384Sjkim $LD $t0,0($a1) 213238384Sjkim $MULTU $t0,$a3 214238384Sjkim $LD $t1,0($a0) 215238384Sjkim subu $a2,1 216238384Sjkim $ADDU $t1,$v0 217238384Sjkim sltu $v0,$t1,$v0 218238384Sjkim mflo $at 219238384Sjkim mfhi $t0 220238384Sjkim $ADDU $t1,$at 221238384Sjkim $ADDU $v0,$t0 222238384Sjkim sltu $at,$t1,$at 223238384Sjkim $ST $t1,0($a0) 224238384Sjkim $ADDU $v0,$at 225238384Sjkim beqz $a2,.L_bn_mul_add_words_return 226238384Sjkim 227238384Sjkim $LD $t0,$BNSZ($a1) 228238384Sjkim $MULTU $t0,$a3 229238384Sjkim $LD $t1,$BNSZ($a0) 230238384Sjkim subu $a2,1 231238384Sjkim $ADDU $t1,$v0 232238384Sjkim sltu $v0,$t1,$v0 233238384Sjkim mflo $at 234238384Sjkim mfhi $t0 235238384Sjkim $ADDU $t1,$at 236238384Sjkim $ADDU $v0,$t0 237238384Sjkim sltu $at,$t1,$at 238238384Sjkim $ST $t1,$BNSZ($a0) 239238384Sjkim $ADDU $v0,$at 240238384Sjkim beqz $a2,.L_bn_mul_add_words_return 241238384Sjkim 242238384Sjkim $LD $t0,2*$BNSZ($a1) 243238384Sjkim $MULTU $t0,$a3 244238384Sjkim $LD $t1,2*$BNSZ($a0) 245238384Sjkim $ADDU $t1,$v0 246238384Sjkim sltu $v0,$t1,$v0 247238384Sjkim mflo $at 248238384Sjkim mfhi $t0 249238384Sjkim $ADDU $t1,$at 250238384Sjkim $ADDU $v0,$t0 251238384Sjkim sltu $at,$t1,$at 252238384Sjkim $ST $t1,2*$BNSZ($a0) 253238384Sjkim $ADDU $v0,$at 254238384Sjkim 255238384Sjkim.L_bn_mul_add_words_return: 256238384Sjkim .set noreorder 257238384Sjkim___ 258238384Sjkim$code.=<<___ if ($flavour =~ /nubi/i); 259238384Sjkim $REG_L $t3,4*$SZREG($sp) 260238384Sjkim $REG_L $t2,3*$SZREG($sp) 261238384Sjkim $REG_L $t1,2*$SZREG($sp) 262238384Sjkim $REG_L $t0,1*$SZREG($sp) 263238384Sjkim $REG_L $gp,0*$SZREG($sp) 264238384Sjkim $PTR_ADD $sp,6*$SZREG 265238384Sjkim___ 266238384Sjkim$code.=<<___; 267238384Sjkim jr $ra 268238384Sjkim move $a0,$v0 269238384Sjkim.end bn_mul_add_words_internal 270238384Sjkim 271238384Sjkim.align 5 272238384Sjkim.globl bn_mul_words 
273238384Sjkim.ent bn_mul_words 274238384Sjkimbn_mul_words: 275238384Sjkim .set noreorder 276238384Sjkim bgtz $a2,bn_mul_words_internal 277238384Sjkim move $v0,$zero 278238384Sjkim jr $ra 279238384Sjkim move $a0,$v0 280238384Sjkim.end bn_mul_words 281238384Sjkim 282238384Sjkim.align 5 283238384Sjkim.ent bn_mul_words_internal 284238384Sjkimbn_mul_words_internal: 285238384Sjkim___ 286238384Sjkim$code.=<<___ if ($flavour =~ /nubi/i); 287238384Sjkim .frame $sp,6*$SZREG,$ra 288238384Sjkim .mask 0x8000f008,-$SZREG 289238384Sjkim .set noreorder 290238384Sjkim $PTR_SUB $sp,6*$SZREG 291238384Sjkim $REG_S $ra,5*$SZREG($sp) 292238384Sjkim $REG_S $t3,4*$SZREG($sp) 293238384Sjkim $REG_S $t2,3*$SZREG($sp) 294238384Sjkim $REG_S $t1,2*$SZREG($sp) 295238384Sjkim $REG_S $t0,1*$SZREG($sp) 296238384Sjkim $REG_S $gp,0*$SZREG($sp) 297238384Sjkim___ 298238384Sjkim$code.=<<___; 299238384Sjkim .set reorder 300238384Sjkim li $minus4,-4 301238384Sjkim and $ta0,$a2,$minus4 302238384Sjkim beqz $ta0,.L_bn_mul_words_tail 303238384Sjkim 304238384Sjkim.L_bn_mul_words_loop: 305264331Sjkim $LD $t0,0($a1) 306238384Sjkim $MULTU $t0,$a3 307238384Sjkim $LD $t2,$BNSZ($a1) 308238384Sjkim $LD $ta0,2*$BNSZ($a1) 309238384Sjkim $LD $ta2,3*$BNSZ($a1) 310238384Sjkim mflo $at 311238384Sjkim mfhi $t0 312238384Sjkim $ADDU $v0,$at 313238384Sjkim sltu $t1,$v0,$at 314238384Sjkim $MULTU $t2,$a3 315238384Sjkim $ST $v0,0($a0) 316238384Sjkim $ADDU $v0,$t1,$t0 317238384Sjkim 318238384Sjkim subu $a2,4 319238384Sjkim $PTR_ADD $a0,4*$BNSZ 320238384Sjkim $PTR_ADD $a1,4*$BNSZ 321238384Sjkim mflo $at 322238384Sjkim mfhi $t2 323238384Sjkim $ADDU $v0,$at 324238384Sjkim sltu $t3,$v0,$at 325238384Sjkim $MULTU $ta0,$a3 326238384Sjkim $ST $v0,-3*$BNSZ($a0) 327238384Sjkim $ADDU $v0,$t3,$t2 328238384Sjkim 329238384Sjkim mflo $at 330238384Sjkim mfhi $ta0 331238384Sjkim $ADDU $v0,$at 332238384Sjkim sltu $ta1,$v0,$at 333238384Sjkim $MULTU $ta2,$a3 334238384Sjkim $ST $v0,-2*$BNSZ($a0) 335238384Sjkim $ADDU $v0,$ta1,$ta0 336238384Sjkim 
337238384Sjkim and $ta0,$a2,$minus4 338238384Sjkim mflo $at 339238384Sjkim mfhi $ta2 340238384Sjkim $ADDU $v0,$at 341238384Sjkim sltu $ta3,$v0,$at 342238384Sjkim $ST $v0,-$BNSZ($a0) 343264331Sjkim .set noreorder 344264331Sjkim bgtz $ta0,.L_bn_mul_words_loop 345238384Sjkim $ADDU $v0,$ta3,$ta2 346238384Sjkim 347238384Sjkim beqz $a2,.L_bn_mul_words_return 348238384Sjkim nop 349238384Sjkim 350238384Sjkim.L_bn_mul_words_tail: 351238384Sjkim .set reorder 352238384Sjkim $LD $t0,0($a1) 353238384Sjkim $MULTU $t0,$a3 354238384Sjkim subu $a2,1 355238384Sjkim mflo $at 356238384Sjkim mfhi $t0 357238384Sjkim $ADDU $v0,$at 358238384Sjkim sltu $t1,$v0,$at 359238384Sjkim $ST $v0,0($a0) 360238384Sjkim $ADDU $v0,$t1,$t0 361238384Sjkim beqz $a2,.L_bn_mul_words_return 362238384Sjkim 363238384Sjkim $LD $t0,$BNSZ($a1) 364238384Sjkim $MULTU $t0,$a3 365238384Sjkim subu $a2,1 366238384Sjkim mflo $at 367238384Sjkim mfhi $t0 368238384Sjkim $ADDU $v0,$at 369238384Sjkim sltu $t1,$v0,$at 370238384Sjkim $ST $v0,$BNSZ($a0) 371238384Sjkim $ADDU $v0,$t1,$t0 372238384Sjkim beqz $a2,.L_bn_mul_words_return 373238384Sjkim 374238384Sjkim $LD $t0,2*$BNSZ($a1) 375238384Sjkim $MULTU $t0,$a3 376238384Sjkim mflo $at 377238384Sjkim mfhi $t0 378238384Sjkim $ADDU $v0,$at 379238384Sjkim sltu $t1,$v0,$at 380238384Sjkim $ST $v0,2*$BNSZ($a0) 381238384Sjkim $ADDU $v0,$t1,$t0 382238384Sjkim 383238384Sjkim.L_bn_mul_words_return: 384238384Sjkim .set noreorder 385238384Sjkim___ 386238384Sjkim$code.=<<___ if ($flavour =~ /nubi/i); 387238384Sjkim $REG_L $t3,4*$SZREG($sp) 388238384Sjkim $REG_L $t2,3*$SZREG($sp) 389238384Sjkim $REG_L $t1,2*$SZREG($sp) 390238384Sjkim $REG_L $t0,1*$SZREG($sp) 391238384Sjkim $REG_L $gp,0*$SZREG($sp) 392238384Sjkim $PTR_ADD $sp,6*$SZREG 393238384Sjkim___ 394238384Sjkim$code.=<<___; 395238384Sjkim jr $ra 396238384Sjkim move $a0,$v0 397238384Sjkim.end bn_mul_words_internal 398238384Sjkim 399238384Sjkim.align 5 400238384Sjkim.globl bn_sqr_words 401238384Sjkim.ent bn_sqr_words 
402238384Sjkimbn_sqr_words: 403238384Sjkim .set noreorder 404238384Sjkim bgtz $a2,bn_sqr_words_internal 405238384Sjkim move $v0,$zero 406238384Sjkim jr $ra 407238384Sjkim move $a0,$v0 408238384Sjkim.end bn_sqr_words 409238384Sjkim 410238384Sjkim.align 5 411238384Sjkim.ent bn_sqr_words_internal 412238384Sjkimbn_sqr_words_internal: 413238384Sjkim___ 414238384Sjkim$code.=<<___ if ($flavour =~ /nubi/i); 415238384Sjkim .frame $sp,6*$SZREG,$ra 416238384Sjkim .mask 0x8000f008,-$SZREG 417238384Sjkim .set noreorder 418238384Sjkim $PTR_SUB $sp,6*$SZREG 419238384Sjkim $REG_S $ra,5*$SZREG($sp) 420238384Sjkim $REG_S $t3,4*$SZREG($sp) 421238384Sjkim $REG_S $t2,3*$SZREG($sp) 422238384Sjkim $REG_S $t1,2*$SZREG($sp) 423238384Sjkim $REG_S $t0,1*$SZREG($sp) 424238384Sjkim $REG_S $gp,0*$SZREG($sp) 425238384Sjkim___ 426238384Sjkim$code.=<<___; 427238384Sjkim .set reorder 428238384Sjkim li $minus4,-4 429238384Sjkim and $ta0,$a2,$minus4 430238384Sjkim beqz $ta0,.L_bn_sqr_words_tail 431238384Sjkim 432238384Sjkim.L_bn_sqr_words_loop: 433264331Sjkim $LD $t0,0($a1) 434238384Sjkim $MULTU $t0,$t0 435238384Sjkim $LD $t2,$BNSZ($a1) 436238384Sjkim $LD $ta0,2*$BNSZ($a1) 437238384Sjkim $LD $ta2,3*$BNSZ($a1) 438238384Sjkim mflo $t1 439238384Sjkim mfhi $t0 440238384Sjkim $ST $t1,0($a0) 441238384Sjkim $ST $t0,$BNSZ($a0) 442238384Sjkim 443238384Sjkim $MULTU $t2,$t2 444238384Sjkim subu $a2,4 445238384Sjkim $PTR_ADD $a0,8*$BNSZ 446238384Sjkim $PTR_ADD $a1,4*$BNSZ 447238384Sjkim mflo $t3 448238384Sjkim mfhi $t2 449238384Sjkim $ST $t3,-6*$BNSZ($a0) 450238384Sjkim $ST $t2,-5*$BNSZ($a0) 451238384Sjkim 452238384Sjkim $MULTU $ta0,$ta0 453238384Sjkim mflo $ta1 454238384Sjkim mfhi $ta0 455238384Sjkim $ST $ta1,-4*$BNSZ($a0) 456238384Sjkim $ST $ta0,-3*$BNSZ($a0) 457238384Sjkim 458238384Sjkim 459238384Sjkim $MULTU $ta2,$ta2 460238384Sjkim and $ta0,$a2,$minus4 461238384Sjkim mflo $ta3 462238384Sjkim mfhi $ta2 463238384Sjkim $ST $ta3,-2*$BNSZ($a0) 464238384Sjkim 465238384Sjkim .set noreorder 466264331Sjkim bgtz 
$ta0,.L_bn_sqr_words_loop 467264331Sjkim $ST $ta2,-$BNSZ($a0) 468238384Sjkim 469238384Sjkim beqz $a2,.L_bn_sqr_words_return 470238384Sjkim nop 471238384Sjkim 472238384Sjkim.L_bn_sqr_words_tail: 473238384Sjkim .set reorder 474238384Sjkim $LD $t0,0($a1) 475238384Sjkim $MULTU $t0,$t0 476238384Sjkim subu $a2,1 477238384Sjkim mflo $t1 478238384Sjkim mfhi $t0 479238384Sjkim $ST $t1,0($a0) 480238384Sjkim $ST $t0,$BNSZ($a0) 481238384Sjkim beqz $a2,.L_bn_sqr_words_return 482238384Sjkim 483238384Sjkim $LD $t0,$BNSZ($a1) 484238384Sjkim $MULTU $t0,$t0 485238384Sjkim subu $a2,1 486238384Sjkim mflo $t1 487238384Sjkim mfhi $t0 488238384Sjkim $ST $t1,2*$BNSZ($a0) 489238384Sjkim $ST $t0,3*$BNSZ($a0) 490238384Sjkim beqz $a2,.L_bn_sqr_words_return 491238384Sjkim 492238384Sjkim $LD $t0,2*$BNSZ($a1) 493238384Sjkim $MULTU $t0,$t0 494238384Sjkim mflo $t1 495238384Sjkim mfhi $t0 496238384Sjkim $ST $t1,4*$BNSZ($a0) 497238384Sjkim $ST $t0,5*$BNSZ($a0) 498238384Sjkim 499238384Sjkim.L_bn_sqr_words_return: 500238384Sjkim .set noreorder 501238384Sjkim___ 502238384Sjkim$code.=<<___ if ($flavour =~ /nubi/i); 503238384Sjkim $REG_L $t3,4*$SZREG($sp) 504238384Sjkim $REG_L $t2,3*$SZREG($sp) 505238384Sjkim $REG_L $t1,2*$SZREG($sp) 506238384Sjkim $REG_L $t0,1*$SZREG($sp) 507238384Sjkim $REG_L $gp,0*$SZREG($sp) 508238384Sjkim $PTR_ADD $sp,6*$SZREG 509238384Sjkim___ 510238384Sjkim$code.=<<___; 511238384Sjkim jr $ra 512238384Sjkim move $a0,$v0 513238384Sjkim 514238384Sjkim.end bn_sqr_words_internal 515238384Sjkim 516238384Sjkim.align 5 517238384Sjkim.globl bn_add_words 518238384Sjkim.ent bn_add_words 519238384Sjkimbn_add_words: 520238384Sjkim .set noreorder 521238384Sjkim bgtz $a3,bn_add_words_internal 522238384Sjkim move $v0,$zero 523238384Sjkim jr $ra 524238384Sjkim move $a0,$v0 525238384Sjkim.end bn_add_words 526238384Sjkim 527238384Sjkim.align 5 528238384Sjkim.ent bn_add_words_internal 529238384Sjkimbn_add_words_internal: 530238384Sjkim___ 531238384Sjkim$code.=<<___ if ($flavour =~ /nubi/i); 
532238384Sjkim .frame $sp,6*$SZREG,$ra 533238384Sjkim .mask 0x8000f008,-$SZREG 534238384Sjkim .set noreorder 535238384Sjkim $PTR_SUB $sp,6*$SZREG 536238384Sjkim $REG_S $ra,5*$SZREG($sp) 537238384Sjkim $REG_S $t3,4*$SZREG($sp) 538238384Sjkim $REG_S $t2,3*$SZREG($sp) 539238384Sjkim $REG_S $t1,2*$SZREG($sp) 540238384Sjkim $REG_S $t0,1*$SZREG($sp) 541238384Sjkim $REG_S $gp,0*$SZREG($sp) 542238384Sjkim___ 543238384Sjkim$code.=<<___; 544238384Sjkim .set reorder 545238384Sjkim li $minus4,-4 546238384Sjkim and $at,$a3,$minus4 547238384Sjkim beqz $at,.L_bn_add_words_tail 548238384Sjkim 549238384Sjkim.L_bn_add_words_loop: 550264331Sjkim $LD $t0,0($a1) 551238384Sjkim $LD $ta0,0($a2) 552238384Sjkim subu $a3,4 553238384Sjkim $LD $t1,$BNSZ($a1) 554238384Sjkim and $at,$a3,$minus4 555238384Sjkim $LD $t2,2*$BNSZ($a1) 556238384Sjkim $PTR_ADD $a2,4*$BNSZ 557238384Sjkim $LD $t3,3*$BNSZ($a1) 558238384Sjkim $PTR_ADD $a0,4*$BNSZ 559238384Sjkim $LD $ta1,-3*$BNSZ($a2) 560238384Sjkim $PTR_ADD $a1,4*$BNSZ 561238384Sjkim $LD $ta2,-2*$BNSZ($a2) 562238384Sjkim $LD $ta3,-$BNSZ($a2) 563238384Sjkim $ADDU $ta0,$t0 564238384Sjkim sltu $t8,$ta0,$t0 565238384Sjkim $ADDU $t0,$ta0,$v0 566238384Sjkim sltu $v0,$t0,$ta0 567238384Sjkim $ST $t0,-4*$BNSZ($a0) 568238384Sjkim $ADDU $v0,$t8 569238384Sjkim 570238384Sjkim $ADDU $ta1,$t1 571238384Sjkim sltu $t9,$ta1,$t1 572238384Sjkim $ADDU $t1,$ta1,$v0 573238384Sjkim sltu $v0,$t1,$ta1 574238384Sjkim $ST $t1,-3*$BNSZ($a0) 575238384Sjkim $ADDU $v0,$t9 576238384Sjkim 577238384Sjkim $ADDU $ta2,$t2 578238384Sjkim sltu $t8,$ta2,$t2 579238384Sjkim $ADDU $t2,$ta2,$v0 580238384Sjkim sltu $v0,$t2,$ta2 581238384Sjkim $ST $t2,-2*$BNSZ($a0) 582238384Sjkim $ADDU $v0,$t8 583238384Sjkim 584238384Sjkim $ADDU $ta3,$t3 585238384Sjkim sltu $t9,$ta3,$t3 586238384Sjkim $ADDU $t3,$ta3,$v0 587238384Sjkim sltu $v0,$t3,$ta3 588238384Sjkim $ST $t3,-$BNSZ($a0) 589238384Sjkim 590238384Sjkim .set noreorder 591264331Sjkim bgtz $at,.L_bn_add_words_loop 592264331Sjkim $ADDU $v0,$t9 593238384Sjkim 
594238384Sjkim beqz $a3,.L_bn_add_words_return 595238384Sjkim nop 596238384Sjkim 597238384Sjkim.L_bn_add_words_tail: 598238384Sjkim .set reorder 599238384Sjkim $LD $t0,0($a1) 600238384Sjkim $LD $ta0,0($a2) 601238384Sjkim $ADDU $ta0,$t0 602238384Sjkim subu $a3,1 603238384Sjkim sltu $t8,$ta0,$t0 604238384Sjkim $ADDU $t0,$ta0,$v0 605238384Sjkim sltu $v0,$t0,$ta0 606238384Sjkim $ST $t0,0($a0) 607238384Sjkim $ADDU $v0,$t8 608238384Sjkim beqz $a3,.L_bn_add_words_return 609238384Sjkim 610238384Sjkim $LD $t1,$BNSZ($a1) 611238384Sjkim $LD $ta1,$BNSZ($a2) 612238384Sjkim $ADDU $ta1,$t1 613238384Sjkim subu $a3,1 614238384Sjkim sltu $t9,$ta1,$t1 615238384Sjkim $ADDU $t1,$ta1,$v0 616238384Sjkim sltu $v0,$t1,$ta1 617238384Sjkim $ST $t1,$BNSZ($a0) 618238384Sjkim $ADDU $v0,$t9 619238384Sjkim beqz $a3,.L_bn_add_words_return 620238384Sjkim 621238384Sjkim $LD $t2,2*$BNSZ($a1) 622238384Sjkim $LD $ta2,2*$BNSZ($a2) 623238384Sjkim $ADDU $ta2,$t2 624238384Sjkim sltu $t8,$ta2,$t2 625238384Sjkim $ADDU $t2,$ta2,$v0 626238384Sjkim sltu $v0,$t2,$ta2 627238384Sjkim $ST $t2,2*$BNSZ($a0) 628238384Sjkim $ADDU $v0,$t8 629238384Sjkim 630238384Sjkim.L_bn_add_words_return: 631238384Sjkim .set noreorder 632238384Sjkim___ 633238384Sjkim$code.=<<___ if ($flavour =~ /nubi/i); 634238384Sjkim $REG_L $t3,4*$SZREG($sp) 635238384Sjkim $REG_L $t2,3*$SZREG($sp) 636238384Sjkim $REG_L $t1,2*$SZREG($sp) 637238384Sjkim $REG_L $t0,1*$SZREG($sp) 638238384Sjkim $REG_L $gp,0*$SZREG($sp) 639238384Sjkim $PTR_ADD $sp,6*$SZREG 640238384Sjkim___ 641238384Sjkim$code.=<<___; 642238384Sjkim jr $ra 643238384Sjkim move $a0,$v0 644238384Sjkim 645238384Sjkim.end bn_add_words_internal 646238384Sjkim 647238384Sjkim.align 5 648238384Sjkim.globl bn_sub_words 649238384Sjkim.ent bn_sub_words 650238384Sjkimbn_sub_words: 651238384Sjkim .set noreorder 652238384Sjkim bgtz $a3,bn_sub_words_internal 653238384Sjkim move $v0,$zero 654238384Sjkim jr $ra 655238384Sjkim move $a0,$zero 656238384Sjkim.end bn_sub_words 657238384Sjkim 
658238384Sjkim.align 5 659238384Sjkim.ent bn_sub_words_internal 660238384Sjkimbn_sub_words_internal: 661238384Sjkim___ 662238384Sjkim$code.=<<___ if ($flavour =~ /nubi/i); 663238384Sjkim .frame $sp,6*$SZREG,$ra 664238384Sjkim .mask 0x8000f008,-$SZREG 665238384Sjkim .set noreorder 666238384Sjkim $PTR_SUB $sp,6*$SZREG 667238384Sjkim $REG_S $ra,5*$SZREG($sp) 668238384Sjkim $REG_S $t3,4*$SZREG($sp) 669238384Sjkim $REG_S $t2,3*$SZREG($sp) 670238384Sjkim $REG_S $t1,2*$SZREG($sp) 671238384Sjkim $REG_S $t0,1*$SZREG($sp) 672238384Sjkim $REG_S $gp,0*$SZREG($sp) 673238384Sjkim___ 674238384Sjkim$code.=<<___; 675238384Sjkim .set reorder 676238384Sjkim li $minus4,-4 677238384Sjkim and $at,$a3,$minus4 678238384Sjkim beqz $at,.L_bn_sub_words_tail 679238384Sjkim 680238384Sjkim.L_bn_sub_words_loop: 681264331Sjkim $LD $t0,0($a1) 682238384Sjkim $LD $ta0,0($a2) 683238384Sjkim subu $a3,4 684238384Sjkim $LD $t1,$BNSZ($a1) 685238384Sjkim and $at,$a3,$minus4 686238384Sjkim $LD $t2,2*$BNSZ($a1) 687238384Sjkim $PTR_ADD $a2,4*$BNSZ 688238384Sjkim $LD $t3,3*$BNSZ($a1) 689238384Sjkim $PTR_ADD $a0,4*$BNSZ 690238384Sjkim $LD $ta1,-3*$BNSZ($a2) 691238384Sjkim $PTR_ADD $a1,4*$BNSZ 692238384Sjkim $LD $ta2,-2*$BNSZ($a2) 693238384Sjkim $LD $ta3,-$BNSZ($a2) 694238384Sjkim sltu $t8,$t0,$ta0 695238384Sjkim $SUBU $ta0,$t0,$ta0 696238384Sjkim $SUBU $t0,$ta0,$v0 697238384Sjkim sgtu $v0,$t0,$ta0 698238384Sjkim $ST $t0,-4*$BNSZ($a0) 699238384Sjkim $ADDU $v0,$t8 700238384Sjkim 701238384Sjkim sltu $t9,$t1,$ta1 702238384Sjkim $SUBU $ta1,$t1,$ta1 703238384Sjkim $SUBU $t1,$ta1,$v0 704238384Sjkim sgtu $v0,$t1,$ta1 705238384Sjkim $ST $t1,-3*$BNSZ($a0) 706238384Sjkim $ADDU $v0,$t9 707238384Sjkim 708238384Sjkim 709238384Sjkim sltu $t8,$t2,$ta2 710238384Sjkim $SUBU $ta2,$t2,$ta2 711238384Sjkim $SUBU $t2,$ta2,$v0 712238384Sjkim sgtu $v0,$t2,$ta2 713238384Sjkim $ST $t2,-2*$BNSZ($a0) 714238384Sjkim $ADDU $v0,$t8 715238384Sjkim 716238384Sjkim sltu $t9,$t3,$ta3 717238384Sjkim $SUBU $ta3,$t3,$ta3 718238384Sjkim $SUBU 
$t3,$ta3,$v0 719238384Sjkim sgtu $v0,$t3,$ta3 720238384Sjkim $ST $t3,-$BNSZ($a0) 721238384Sjkim 722238384Sjkim .set noreorder 723264331Sjkim bgtz $at,.L_bn_sub_words_loop 724264331Sjkim $ADDU $v0,$t9 725238384Sjkim 726238384Sjkim beqz $a3,.L_bn_sub_words_return 727238384Sjkim nop 728238384Sjkim 729238384Sjkim.L_bn_sub_words_tail: 730238384Sjkim .set reorder 731238384Sjkim $LD $t0,0($a1) 732238384Sjkim $LD $ta0,0($a2) 733238384Sjkim subu $a3,1 734238384Sjkim sltu $t8,$t0,$ta0 735238384Sjkim $SUBU $ta0,$t0,$ta0 736238384Sjkim $SUBU $t0,$ta0,$v0 737238384Sjkim sgtu $v0,$t0,$ta0 738238384Sjkim $ST $t0,0($a0) 739238384Sjkim $ADDU $v0,$t8 740238384Sjkim beqz $a3,.L_bn_sub_words_return 741238384Sjkim 742238384Sjkim $LD $t1,$BNSZ($a1) 743238384Sjkim subu $a3,1 744238384Sjkim $LD $ta1,$BNSZ($a2) 745238384Sjkim sltu $t9,$t1,$ta1 746238384Sjkim $SUBU $ta1,$t1,$ta1 747238384Sjkim $SUBU $t1,$ta1,$v0 748238384Sjkim sgtu $v0,$t1,$ta1 749238384Sjkim $ST $t1,$BNSZ($a0) 750238384Sjkim $ADDU $v0,$t9 751238384Sjkim beqz $a3,.L_bn_sub_words_return 752238384Sjkim 753238384Sjkim $LD $t2,2*$BNSZ($a1) 754238384Sjkim $LD $ta2,2*$BNSZ($a2) 755238384Sjkim sltu $t8,$t2,$ta2 756238384Sjkim $SUBU $ta2,$t2,$ta2 757238384Sjkim $SUBU $t2,$ta2,$v0 758238384Sjkim sgtu $v0,$t2,$ta2 759238384Sjkim $ST $t2,2*$BNSZ($a0) 760238384Sjkim $ADDU $v0,$t8 761238384Sjkim 762238384Sjkim.L_bn_sub_words_return: 763238384Sjkim .set noreorder 764238384Sjkim___ 765238384Sjkim$code.=<<___ if ($flavour =~ /nubi/i); 766238384Sjkim $REG_L $t3,4*$SZREG($sp) 767238384Sjkim $REG_L $t2,3*$SZREG($sp) 768238384Sjkim $REG_L $t1,2*$SZREG($sp) 769238384Sjkim $REG_L $t0,1*$SZREG($sp) 770238384Sjkim $REG_L $gp,0*$SZREG($sp) 771238384Sjkim $PTR_ADD $sp,6*$SZREG 772238384Sjkim___ 773238384Sjkim$code.=<<___; 774238384Sjkim jr $ra 775238384Sjkim move $a0,$v0 776238384Sjkim.end bn_sub_words_internal 777238384Sjkim 778238384Sjkim.align 5 779238384Sjkim.globl bn_div_3_words 780238384Sjkim.ent bn_div_3_words 781238384Sjkimbn_div_3_words: 
782238384Sjkim .set noreorder 783238384Sjkim move $a3,$a0 # we know that bn_div_words does not 784238384Sjkim # touch $a3, $ta2, $ta3 and preserves $a2 785238384Sjkim # so that we can save two arguments 786238384Sjkim # and return address in registers 787238384Sjkim # instead of stack:-) 788238384Sjkim 789238384Sjkim $LD $a0,($a3) 790238384Sjkim move $ta2,$a1 791238384Sjkim bne $a0,$a2,bn_div_3_words_internal 792238384Sjkim $LD $a1,-$BNSZ($a3) 793238384Sjkim li $v0,-1 794238384Sjkim jr $ra 795238384Sjkim move $a0,$v0 796238384Sjkim.end bn_div_3_words 797238384Sjkim 798238384Sjkim.align 5 799238384Sjkim.ent bn_div_3_words_internal 800238384Sjkimbn_div_3_words_internal: 801238384Sjkim___ 802238384Sjkim$code.=<<___ if ($flavour =~ /nubi/i); 803238384Sjkim .frame $sp,6*$SZREG,$ra 804238384Sjkim .mask 0x8000f008,-$SZREG 805238384Sjkim .set noreorder 806238384Sjkim $PTR_SUB $sp,6*$SZREG 807238384Sjkim $REG_S $ra,5*$SZREG($sp) 808238384Sjkim $REG_S $t3,4*$SZREG($sp) 809238384Sjkim $REG_S $t2,3*$SZREG($sp) 810238384Sjkim $REG_S $t1,2*$SZREG($sp) 811238384Sjkim $REG_S $t0,1*$SZREG($sp) 812238384Sjkim $REG_S $gp,0*$SZREG($sp) 813238384Sjkim___ 814238384Sjkim$code.=<<___; 815238384Sjkim .set reorder 816238384Sjkim move $ta3,$ra 817246772Sjkim bal bn_div_words_internal 818238384Sjkim move $ra,$ta3 819238384Sjkim $MULTU $ta2,$v0 820238384Sjkim $LD $t2,-2*$BNSZ($a3) 821238384Sjkim move $ta0,$zero 822238384Sjkim mfhi $t1 823238384Sjkim mflo $t0 824238384Sjkim sltu $t8,$t1,$a1 825238384Sjkim.L_bn_div_3_words_inner_loop: 826238384Sjkim bnez $t8,.L_bn_div_3_words_inner_loop_done 827238384Sjkim sgeu $at,$t2,$t0 828238384Sjkim seq $t9,$t1,$a1 829238384Sjkim and $at,$t9 830238384Sjkim sltu $t3,$t0,$ta2 831238384Sjkim $ADDU $a1,$a2 832238384Sjkim $SUBU $t1,$t3 833238384Sjkim $SUBU $t0,$ta2 834238384Sjkim sltu $t8,$t1,$a1 835238384Sjkim sltu $ta0,$a1,$a2 836238384Sjkim or $t8,$ta0 837238384Sjkim .set noreorder 838264331Sjkim beqz $at,.L_bn_div_3_words_inner_loop 839238384Sjkim $SUBU 
$v0,1 840264331Sjkim $ADDU $v0,1 841238384Sjkim .set reorder 842238384Sjkim.L_bn_div_3_words_inner_loop_done: 843238384Sjkim .set noreorder 844238384Sjkim___ 845238384Sjkim$code.=<<___ if ($flavour =~ /nubi/i); 846238384Sjkim $REG_L $t3,4*$SZREG($sp) 847238384Sjkim $REG_L $t2,3*$SZREG($sp) 848238384Sjkim $REG_L $t1,2*$SZREG($sp) 849238384Sjkim $REG_L $t0,1*$SZREG($sp) 850238384Sjkim $REG_L $gp,0*$SZREG($sp) 851238384Sjkim $PTR_ADD $sp,6*$SZREG 852238384Sjkim___ 853238384Sjkim$code.=<<___; 854238384Sjkim jr $ra 855238384Sjkim move $a0,$v0 856238384Sjkim.end bn_div_3_words_internal 857238384Sjkim 858238384Sjkim.align 5 859238384Sjkim.globl bn_div_words 860238384Sjkim.ent bn_div_words 861238384Sjkimbn_div_words: 862238384Sjkim .set noreorder 863238384Sjkim bnez $a2,bn_div_words_internal 864238384Sjkim li $v0,-1 # I would rather signal div-by-zero 865238384Sjkim # which can be done with 'break 7' 866238384Sjkim jr $ra 867238384Sjkim move $a0,$v0 868238384Sjkim.end bn_div_words 869238384Sjkim 870238384Sjkim.align 5 871238384Sjkim.ent bn_div_words_internal 872238384Sjkimbn_div_words_internal: 873238384Sjkim___ 874238384Sjkim$code.=<<___ if ($flavour =~ /nubi/i); 875238384Sjkim .frame $sp,6*$SZREG,$ra 876238384Sjkim .mask 0x8000f008,-$SZREG 877238384Sjkim .set noreorder 878238384Sjkim $PTR_SUB $sp,6*$SZREG 879238384Sjkim $REG_S $ra,5*$SZREG($sp) 880238384Sjkim $REG_S $t3,4*$SZREG($sp) 881238384Sjkim $REG_S $t2,3*$SZREG($sp) 882238384Sjkim $REG_S $t1,2*$SZREG($sp) 883238384Sjkim $REG_S $t0,1*$SZREG($sp) 884238384Sjkim $REG_S $gp,0*$SZREG($sp) 885238384Sjkim___ 886238384Sjkim$code.=<<___; 887238384Sjkim move $v1,$zero 888238384Sjkim bltz $a2,.L_bn_div_words_body 889238384Sjkim move $t9,$v1 890238384Sjkim $SLL $a2,1 891238384Sjkim bgtz $a2,.-4 892238384Sjkim addu $t9,1 893238384Sjkim 894238384Sjkim .set reorder 895238384Sjkim negu $t1,$t9 896238384Sjkim li $t2,-1 897238384Sjkim $SLL $t2,$t1 898238384Sjkim and $t2,$a0 899238384Sjkim $SRL $at,$a1,$t1 900238384Sjkim .set 
noreorder 901264331Sjkim beqz $t2,.+12 902264331Sjkim nop 903238384Sjkim break 6 # signal overflow 904238384Sjkim .set reorder 905238384Sjkim $SLL $a0,$t9 906238384Sjkim $SLL $a1,$t9 907238384Sjkim or $a0,$at 908238384Sjkim___ 909238384Sjkim$QT=$ta0; 910238384Sjkim$HH=$ta1; 911238384Sjkim$DH=$v1; 912238384Sjkim$code.=<<___; 913238384Sjkim.L_bn_div_words_body: 914238384Sjkim $SRL $DH,$a2,4*$BNSZ # bits 915238384Sjkim sgeu $at,$a0,$a2 916238384Sjkim .set noreorder 917264331Sjkim beqz $at,.+12 918264331Sjkim nop 919238384Sjkim $SUBU $a0,$a2 920238384Sjkim .set reorder 921238384Sjkim 922238384Sjkim li $QT,-1 923238384Sjkim $SRL $HH,$a0,4*$BNSZ # bits 924238384Sjkim $SRL $QT,4*$BNSZ # q=0xffffffff 925238384Sjkim beq $DH,$HH,.L_bn_div_words_skip_div1 926238384Sjkim $DIVU $zero,$a0,$DH 927238384Sjkim mflo $QT 928238384Sjkim.L_bn_div_words_skip_div1: 929238384Sjkim $MULTU $a2,$QT 930238384Sjkim $SLL $t3,$a0,4*$BNSZ # bits 931238384Sjkim $SRL $at,$a1,4*$BNSZ # bits 932238384Sjkim or $t3,$at 933238384Sjkim mflo $t0 934238384Sjkim mfhi $t1 935238384Sjkim.L_bn_div_words_inner_loop1: 936238384Sjkim sltu $t2,$t3,$t0 937238384Sjkim seq $t8,$HH,$t1 938238384Sjkim sltu $at,$HH,$t1 939238384Sjkim and $t2,$t8 940238384Sjkim sltu $v0,$t0,$a2 941238384Sjkim or $at,$t2 942238384Sjkim .set noreorder 943238384Sjkim beqz $at,.L_bn_div_words_inner_loop1_done 944238384Sjkim $SUBU $t1,$v0 945238384Sjkim $SUBU $t0,$a2 946238384Sjkim b .L_bn_div_words_inner_loop1 947238384Sjkim $SUBU $QT,1 948238384Sjkim .set reorder 949238384Sjkim.L_bn_div_words_inner_loop1_done: 950238384Sjkim 951238384Sjkim $SLL $a1,4*$BNSZ # bits 952238384Sjkim $SUBU $a0,$t3,$t0 953238384Sjkim $SLL $v0,$QT,4*$BNSZ # bits 954238384Sjkim 955238384Sjkim li $QT,-1 956238384Sjkim $SRL $HH,$a0,4*$BNSZ # bits 957238384Sjkim $SRL $QT,4*$BNSZ # q=0xffffffff 958238384Sjkim beq $DH,$HH,.L_bn_div_words_skip_div2 959238384Sjkim $DIVU $zero,$a0,$DH 960238384Sjkim mflo $QT 961238384Sjkim.L_bn_div_words_skip_div2: 962238384Sjkim $MULTU 
$a2,$QT 963238384Sjkim $SLL $t3,$a0,4*$BNSZ # bits 964238384Sjkim $SRL $at,$a1,4*$BNSZ # bits 965238384Sjkim or $t3,$at 966238384Sjkim mflo $t0 967238384Sjkim mfhi $t1 968238384Sjkim.L_bn_div_words_inner_loop2: 969238384Sjkim sltu $t2,$t3,$t0 970238384Sjkim seq $t8,$HH,$t1 971238384Sjkim sltu $at,$HH,$t1 972238384Sjkim and $t2,$t8 973238384Sjkim sltu $v1,$t0,$a2 974238384Sjkim or $at,$t2 975238384Sjkim .set noreorder 976238384Sjkim beqz $at,.L_bn_div_words_inner_loop2_done 977238384Sjkim $SUBU $t1,$v1 978238384Sjkim $SUBU $t0,$a2 979238384Sjkim b .L_bn_div_words_inner_loop2 980238384Sjkim $SUBU $QT,1 981238384Sjkim .set reorder 982238384Sjkim.L_bn_div_words_inner_loop2_done: 983238384Sjkim 984238384Sjkim $SUBU $a0,$t3,$t0 985238384Sjkim or $v0,$QT 986238384Sjkim $SRL $v1,$a0,$t9 # $v1 contains remainder if anybody wants it 987238384Sjkim $SRL $a2,$t9 # restore $a2 988238384Sjkim 989238384Sjkim .set noreorder 990238384Sjkim move $a1,$v1 991238384Sjkim___ 992238384Sjkim$code.=<<___ if ($flavour =~ /nubi/i); 993238384Sjkim $REG_L $t3,4*$SZREG($sp) 994238384Sjkim $REG_L $t2,3*$SZREG($sp) 995238384Sjkim $REG_L $t1,2*$SZREG($sp) 996238384Sjkim $REG_L $t0,1*$SZREG($sp) 997238384Sjkim $REG_L $gp,0*$SZREG($sp) 998238384Sjkim $PTR_ADD $sp,6*$SZREG 999238384Sjkim___ 1000238384Sjkim$code.=<<___; 1001238384Sjkim jr $ra 1002238384Sjkim move $a0,$v0 1003238384Sjkim.end bn_div_words_internal 1004238384Sjkim___ 1005238384Sjkimundef $HH; undef $QT; undef $DH; 1006238384Sjkim 1007238384Sjkim($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3); 1008238384Sjkim($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3); 1009238384Sjkim 1010238384Sjkim($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1); # once we load a[7], no use for $a1 1011238384Sjkim($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2); # once we load b[7], no use for $a2 1012238384Sjkim 1013238384Sjkim($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3); 1014238384Sjkim 1015238384Sjkim$code.=<<___; 1016238384Sjkim 1017238384Sjkim.align 5 1018238384Sjkim.globl bn_mul_comba8 
1019238384Sjkim.ent bn_mul_comba8 1020238384Sjkimbn_mul_comba8: 1021238384Sjkim .set noreorder 1022238384Sjkim___ 1023238384Sjkim$code.=<<___ if ($flavour =~ /nubi/i); 1024238384Sjkim .frame $sp,12*$SZREG,$ra 1025238384Sjkim .mask 0x803ff008,-$SZREG 1026238384Sjkim $PTR_SUB $sp,12*$SZREG 1027238384Sjkim $REG_S $ra,11*$SZREG($sp) 1028238384Sjkim $REG_S $s5,10*$SZREG($sp) 1029238384Sjkim $REG_S $s4,9*$SZREG($sp) 1030238384Sjkim $REG_S $s3,8*$SZREG($sp) 1031238384Sjkim $REG_S $s2,7*$SZREG($sp) 1032238384Sjkim $REG_S $s1,6*$SZREG($sp) 1033238384Sjkim $REG_S $s0,5*$SZREG($sp) 1034238384Sjkim $REG_S $t3,4*$SZREG($sp) 1035238384Sjkim $REG_S $t2,3*$SZREG($sp) 1036238384Sjkim $REG_S $t1,2*$SZREG($sp) 1037238384Sjkim $REG_S $t0,1*$SZREG($sp) 1038238384Sjkim $REG_S $gp,0*$SZREG($sp) 1039238384Sjkim___ 1040238384Sjkim$code.=<<___ if ($flavour !~ /nubi/i); 1041238384Sjkim .frame $sp,6*$SZREG,$ra 1042238384Sjkim .mask 0x003f0000,-$SZREG 1043238384Sjkim $PTR_SUB $sp,6*$SZREG 1044238384Sjkim $REG_S $s5,5*$SZREG($sp) 1045238384Sjkim $REG_S $s4,4*$SZREG($sp) 1046238384Sjkim $REG_S $s3,3*$SZREG($sp) 1047238384Sjkim $REG_S $s2,2*$SZREG($sp) 1048238384Sjkim $REG_S $s1,1*$SZREG($sp) 1049238384Sjkim $REG_S $s0,0*$SZREG($sp) 1050238384Sjkim___ 1051238384Sjkim$code.=<<___; 1052238384Sjkim 1053238384Sjkim .set reorder 1054238384Sjkim $LD $a_0,0($a1) # If compiled with -mips3 option on 1055238384Sjkim # R5000 box assembler barks on this 1056238384Sjkim # 1ine with "should not have mult/div 1057238384Sjkim # as last instruction in bb (R10K 1058238384Sjkim # bug)" warning. If anybody out there 1059238384Sjkim # has a clue about how to circumvent 1060238384Sjkim # this do send me a note. 
1061238384Sjkim # <appro\@fy.chalmers.se> 1062238384Sjkim 1063238384Sjkim $LD $b_0,0($a2) 1064238384Sjkim $LD $a_1,$BNSZ($a1) 1065238384Sjkim $LD $a_2,2*$BNSZ($a1) 1066238384Sjkim $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3); 1067238384Sjkim $LD $a_3,3*$BNSZ($a1) 1068238384Sjkim $LD $b_1,$BNSZ($a2) 1069238384Sjkim $LD $b_2,2*$BNSZ($a2) 1070238384Sjkim $LD $b_3,3*$BNSZ($a2) 1071238384Sjkim mflo $c_1 1072238384Sjkim mfhi $c_2 1073238384Sjkim 1074238384Sjkim $LD $a_4,4*$BNSZ($a1) 1075238384Sjkim $LD $a_5,5*$BNSZ($a1) 1076238384Sjkim $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1); 1077238384Sjkim $LD $a_6,6*$BNSZ($a1) 1078238384Sjkim $LD $a_7,7*$BNSZ($a1) 1079238384Sjkim $LD $b_4,4*$BNSZ($a2) 1080238384Sjkim $LD $b_5,5*$BNSZ($a2) 1081238384Sjkim mflo $t_1 1082238384Sjkim mfhi $t_2 1083238384Sjkim $ADDU $c_2,$t_1 1084238384Sjkim sltu $at,$c_2,$t_1 1085238384Sjkim $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1); 1086238384Sjkim $ADDU $c_3,$t_2,$at 1087238384Sjkim $LD $b_6,6*$BNSZ($a2) 1088238384Sjkim $LD $b_7,7*$BNSZ($a2) 1089238384Sjkim $ST $c_1,0($a0) # r[0]=c1; 1090238384Sjkim mflo $t_1 1091238384Sjkim mfhi $t_2 1092238384Sjkim $ADDU $c_2,$t_1 1093238384Sjkim sltu $at,$c_2,$t_1 1094238384Sjkim $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2); 1095238384Sjkim $ADDU $t_2,$at 1096238384Sjkim $ADDU $c_3,$t_2 1097238384Sjkim sltu $c_1,$c_3,$t_2 1098238384Sjkim $ST $c_2,$BNSZ($a0) # r[1]=c2; 1099238384Sjkim 1100238384Sjkim mflo $t_1 1101238384Sjkim mfhi $t_2 1102238384Sjkim $ADDU $c_3,$t_1 1103238384Sjkim sltu $at,$c_3,$t_1 1104238384Sjkim $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2); 1105238384Sjkim $ADDU $t_2,$at 1106238384Sjkim $ADDU $c_1,$t_2 1107238384Sjkim mflo $t_1 1108238384Sjkim mfhi $t_2 1109238384Sjkim $ADDU $c_3,$t_1 1110238384Sjkim sltu $at,$c_3,$t_1 1111238384Sjkim $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2); 1112238384Sjkim $ADDU $t_2,$at 1113238384Sjkim $ADDU $c_1,$t_2 1114238384Sjkim sltu $c_2,$c_1,$t_2 1115238384Sjkim mflo $t_1 
1116238384Sjkim mfhi $t_2 1117238384Sjkim $ADDU $c_3,$t_1 1118238384Sjkim sltu $at,$c_3,$t_1 1119238384Sjkim $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3); 1120238384Sjkim $ADDU $t_2,$at 1121238384Sjkim $ADDU $c_1,$t_2 1122238384Sjkim sltu $at,$c_1,$t_2 1123238384Sjkim $ADDU $c_2,$at 1124238384Sjkim $ST $c_3,2*$BNSZ($a0) # r[2]=c3; 1125238384Sjkim 1126238384Sjkim mflo $t_1 1127238384Sjkim mfhi $t_2 1128238384Sjkim $ADDU $c_1,$t_1 1129238384Sjkim sltu $at,$c_1,$t_1 1130238384Sjkim $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3); 1131238384Sjkim $ADDU $t_2,$at 1132238384Sjkim $ADDU $c_2,$t_2 1133238384Sjkim sltu $c_3,$c_2,$t_2 1134238384Sjkim mflo $t_1 1135238384Sjkim mfhi $t_2 1136238384Sjkim $ADDU $c_1,$t_1 1137238384Sjkim sltu $at,$c_1,$t_1 1138238384Sjkim $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3); 1139238384Sjkim $ADDU $t_2,$at 1140238384Sjkim $ADDU $c_2,$t_2 1141238384Sjkim sltu $at,$c_2,$t_2 1142238384Sjkim $ADDU $c_3,$at 1143238384Sjkim mflo $t_1 1144238384Sjkim mfhi $t_2 1145238384Sjkim $ADDU $c_1,$t_1 1146238384Sjkim sltu $at,$c_1,$t_1 1147238384Sjkim $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3); 1148238384Sjkim $ADDU $t_2,$at 1149238384Sjkim $ADDU $c_2,$t_2 1150238384Sjkim sltu $at,$c_2,$t_2 1151238384Sjkim $ADDU $c_3,$at 1152238384Sjkim mflo $t_1 1153238384Sjkim mfhi $t_2 1154238384Sjkim $ADDU $c_1,$t_1 1155238384Sjkim sltu $at,$c_1,$t_1 1156238384Sjkim $MULTU $a_4,$b_0 # mul_add_c(a[4],b[0],c2,c3,c1); 1157238384Sjkim $ADDU $t_2,$at 1158238384Sjkim $ADDU $c_2,$t_2 1159238384Sjkim sltu $at,$c_2,$t_2 1160238384Sjkim $ADDU $c_3,$at 1161238384Sjkim $ST $c_1,3*$BNSZ($a0) # r[3]=c1; 1162238384Sjkim 1163238384Sjkim mflo $t_1 1164238384Sjkim mfhi $t_2 1165238384Sjkim $ADDU $c_2,$t_1 1166238384Sjkim sltu $at,$c_2,$t_1 1167238384Sjkim $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1); 1168238384Sjkim $ADDU $t_2,$at 1169238384Sjkim $ADDU $c_3,$t_2 1170238384Sjkim sltu $c_1,$c_3,$t_2 1171238384Sjkim mflo $t_1 1172238384Sjkim mfhi $t_2 1173238384Sjkim 
$ADDU $c_2,$t_1 1174238384Sjkim sltu $at,$c_2,$t_1 1175238384Sjkim $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1); 1176238384Sjkim $ADDU $t_2,$at 1177238384Sjkim $ADDU $c_3,$t_2 1178238384Sjkim sltu $at,$c_3,$t_2 1179238384Sjkim $ADDU $c_1,$at 1180238384Sjkim mflo $t_1 1181238384Sjkim mfhi $t_2 1182238384Sjkim $ADDU $c_2,$t_1 1183238384Sjkim sltu $at,$c_2,$t_1 1184238384Sjkim $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1); 1185238384Sjkim $ADDU $t_2,$at 1186238384Sjkim $ADDU $c_3,$t_2 1187238384Sjkim sltu $at,$c_3,$t_2 1188238384Sjkim $ADDU $c_1,$at 1189238384Sjkim mflo $t_1 1190238384Sjkim mfhi $t_2 1191238384Sjkim $ADDU $c_2,$t_1 1192238384Sjkim sltu $at,$c_2,$t_1 1193238384Sjkim $MULTU $a_0,$b_4 # mul_add_c(a[0],b[4],c2,c3,c1); 1194238384Sjkim $ADDU $t_2,$at 1195238384Sjkim $ADDU $c_3,$t_2 1196238384Sjkim sltu $at,$c_3,$t_2 1197238384Sjkim $ADDU $c_1,$at 1198238384Sjkim mflo $t_1 1199238384Sjkim mfhi $t_2 1200238384Sjkim $ADDU $c_2,$t_1 1201238384Sjkim sltu $at,$c_2,$t_1 1202238384Sjkim $MULTU $a_0,$b_5 # mul_add_c(a[0],b[5],c3,c1,c2); 1203238384Sjkim $ADDU $t_2,$at 1204238384Sjkim $ADDU $c_3,$t_2 1205238384Sjkim sltu $at,$c_3,$t_2 1206238384Sjkim $ADDU $c_1,$at 1207238384Sjkim $ST $c_2,4*$BNSZ($a0) # r[4]=c2; 1208238384Sjkim 1209238384Sjkim mflo $t_1 1210238384Sjkim mfhi $t_2 1211238384Sjkim $ADDU $c_3,$t_1 1212238384Sjkim sltu $at,$c_3,$t_1 1213238384Sjkim $MULTU $a_1,$b_4 # mul_add_c(a[1],b[4],c3,c1,c2); 1214238384Sjkim $ADDU $t_2,$at 1215238384Sjkim $ADDU $c_1,$t_2 1216238384Sjkim sltu $c_2,$c_1,$t_2 1217238384Sjkim mflo $t_1 1218238384Sjkim mfhi $t_2 1219238384Sjkim $ADDU $c_3,$t_1 1220238384Sjkim sltu $at,$c_3,$t_1 1221238384Sjkim $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2); 1222238384Sjkim $ADDU $t_2,$at 1223238384Sjkim $ADDU $c_1,$t_2 1224238384Sjkim sltu $at,$c_1,$t_2 1225238384Sjkim $ADDU $c_2,$at 1226238384Sjkim mflo $t_1 1227238384Sjkim mfhi $t_2 1228238384Sjkim $ADDU $c_3,$t_1 1229238384Sjkim sltu $at,$c_3,$t_1 1230238384Sjkim $MULTU 
$a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2); 1231238384Sjkim $ADDU $t_2,$at 1232238384Sjkim $ADDU $c_1,$t_2 1233238384Sjkim sltu $at,$c_1,$t_2 1234238384Sjkim $ADDU $c_2,$at 1235238384Sjkim mflo $t_1 1236238384Sjkim mfhi $t_2 1237238384Sjkim $ADDU $c_3,$t_1 1238238384Sjkim sltu $at,$c_3,$t_1 1239238384Sjkim $MULTU $a_4,$b_1 # mul_add_c(a[4],b[1],c3,c1,c2); 1240238384Sjkim $ADDU $t_2,$at 1241238384Sjkim $ADDU $c_1,$t_2 1242238384Sjkim sltu $at,$c_1,$t_2 1243238384Sjkim $ADDU $c_2,$at 1244238384Sjkim mflo $t_1 1245238384Sjkim mfhi $t_2 1246238384Sjkim $ADDU $c_3,$t_1 1247238384Sjkim sltu $at,$c_3,$t_1 1248238384Sjkim $MULTU $a_5,$b_0 # mul_add_c(a[5],b[0],c3,c1,c2); 1249238384Sjkim $ADDU $t_2,$at 1250238384Sjkim $ADDU $c_1,$t_2 1251238384Sjkim sltu $at,$c_1,$t_2 1252238384Sjkim $ADDU $c_2,$at 1253238384Sjkim mflo $t_1 1254238384Sjkim mfhi $t_2 1255238384Sjkim $ADDU $c_3,$t_1 1256238384Sjkim sltu $at,$c_3,$t_1 1257238384Sjkim $MULTU $a_6,$b_0 # mul_add_c(a[6],b[0],c1,c2,c3); 1258238384Sjkim $ADDU $t_2,$at 1259238384Sjkim $ADDU $c_1,$t_2 1260238384Sjkim sltu $at,$c_1,$t_2 1261238384Sjkim $ADDU $c_2,$at 1262238384Sjkim $ST $c_3,5*$BNSZ($a0) # r[5]=c3; 1263238384Sjkim 1264238384Sjkim mflo $t_1 1265238384Sjkim mfhi $t_2 1266238384Sjkim $ADDU $c_1,$t_1 1267238384Sjkim sltu $at,$c_1,$t_1 1268238384Sjkim $MULTU $a_5,$b_1 # mul_add_c(a[5],b[1],c1,c2,c3); 1269238384Sjkim $ADDU $t_2,$at 1270238384Sjkim $ADDU $c_2,$t_2 1271238384Sjkim sltu $c_3,$c_2,$t_2 1272238384Sjkim mflo $t_1 1273238384Sjkim mfhi $t_2 1274238384Sjkim $ADDU $c_1,$t_1 1275238384Sjkim sltu $at,$c_1,$t_1 1276238384Sjkim $MULTU $a_4,$b_2 # mul_add_c(a[4],b[2],c1,c2,c3); 1277238384Sjkim $ADDU $t_2,$at 1278238384Sjkim $ADDU $c_2,$t_2 1279238384Sjkim sltu $at,$c_2,$t_2 1280238384Sjkim $ADDU $c_3,$at 1281238384Sjkim mflo $t_1 1282238384Sjkim mfhi $t_2 1283238384Sjkim $ADDU $c_1,$t_1 1284238384Sjkim sltu $at,$c_1,$t_1 1285238384Sjkim $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3); 1286238384Sjkim $ADDU $t_2,$at 
1287238384Sjkim $ADDU $c_2,$t_2 1288238384Sjkim sltu $at,$c_2,$t_2 1289238384Sjkim $ADDU $c_3,$at 1290238384Sjkim mflo $t_1 1291238384Sjkim mfhi $t_2 1292238384Sjkim $ADDU $c_1,$t_1 1293238384Sjkim sltu $at,$c_1,$t_1 1294238384Sjkim $MULTU $a_2,$b_4 # mul_add_c(a[2],b[4],c1,c2,c3); 1295238384Sjkim $ADDU $t_2,$at 1296238384Sjkim $ADDU $c_2,$t_2 1297238384Sjkim sltu $at,$c_2,$t_2 1298238384Sjkim $ADDU $c_3,$at 1299238384Sjkim mflo $t_1 1300238384Sjkim mfhi $t_2 1301238384Sjkim $ADDU $c_1,$t_1 1302238384Sjkim sltu $at,$c_1,$t_1 1303238384Sjkim $MULTU $a_1,$b_5 # mul_add_c(a[1],b[5],c1,c2,c3); 1304238384Sjkim $ADDU $t_2,$at 1305238384Sjkim $ADDU $c_2,$t_2 1306238384Sjkim sltu $at,$c_2,$t_2 1307238384Sjkim $ADDU $c_3,$at 1308238384Sjkim mflo $t_1 1309238384Sjkim mfhi $t_2 1310238384Sjkim $ADDU $c_1,$t_1 1311238384Sjkim sltu $at,$c_1,$t_1 1312238384Sjkim $MULTU $a_0,$b_6 # mul_add_c(a[0],b[6],c1,c2,c3); 1313238384Sjkim $ADDU $t_2,$at 1314238384Sjkim $ADDU $c_2,$t_2 1315238384Sjkim sltu $at,$c_2,$t_2 1316238384Sjkim $ADDU $c_3,$at 1317238384Sjkim mflo $t_1 1318238384Sjkim mfhi $t_2 1319238384Sjkim $ADDU $c_1,$t_1 1320238384Sjkim sltu $at,$c_1,$t_1 1321238384Sjkim $MULTU $a_0,$b_7 # mul_add_c(a[0],b[7],c2,c3,c1); 1322238384Sjkim $ADDU $t_2,$at 1323238384Sjkim $ADDU $c_2,$t_2 1324238384Sjkim sltu $at,$c_2,$t_2 1325238384Sjkim $ADDU $c_3,$at 1326238384Sjkim $ST $c_1,6*$BNSZ($a0) # r[6]=c1; 1327238384Sjkim 1328238384Sjkim mflo $t_1 1329238384Sjkim mfhi $t_2 1330238384Sjkim $ADDU $c_2,$t_1 1331238384Sjkim sltu $at,$c_2,$t_1 1332238384Sjkim $MULTU $a_1,$b_6 # mul_add_c(a[1],b[6],c2,c3,c1); 1333238384Sjkim $ADDU $t_2,$at 1334238384Sjkim $ADDU $c_3,$t_2 1335238384Sjkim sltu $c_1,$c_3,$t_2 1336238384Sjkim mflo $t_1 1337238384Sjkim mfhi $t_2 1338238384Sjkim $ADDU $c_2,$t_1 1339238384Sjkim sltu $at,$c_2,$t_1 1340238384Sjkim $MULTU $a_2,$b_5 # mul_add_c(a[2],b[5],c2,c3,c1); 1341238384Sjkim $ADDU $t_2,$at 1342238384Sjkim $ADDU $c_3,$t_2 1343238384Sjkim sltu $at,$c_3,$t_2 
1344238384Sjkim $ADDU $c_1,$at 1345238384Sjkim mflo $t_1 1346238384Sjkim mfhi $t_2 1347238384Sjkim $ADDU $c_2,$t_1 1348238384Sjkim sltu $at,$c_2,$t_1 1349238384Sjkim $MULTU $a_3,$b_4 # mul_add_c(a[3],b[4],c2,c3,c1); 1350238384Sjkim $ADDU $t_2,$at 1351238384Sjkim $ADDU $c_3,$t_2 1352238384Sjkim sltu $at,$c_3,$t_2 1353238384Sjkim $ADDU $c_1,$at 1354238384Sjkim mflo $t_1 1355238384Sjkim mfhi $t_2 1356238384Sjkim $ADDU $c_2,$t_1 1357238384Sjkim sltu $at,$c_2,$t_1 1358238384Sjkim $MULTU $a_4,$b_3 # mul_add_c(a[4],b[3],c2,c3,c1); 1359238384Sjkim $ADDU $t_2,$at 1360238384Sjkim $ADDU $c_3,$t_2 1361238384Sjkim sltu $at,$c_3,$t_2 1362238384Sjkim $ADDU $c_1,$at 1363238384Sjkim mflo $t_1 1364238384Sjkim mfhi $t_2 1365238384Sjkim $ADDU $c_2,$t_1 1366238384Sjkim sltu $at,$c_2,$t_1 1367238384Sjkim $MULTU $a_5,$b_2 # mul_add_c(a[5],b[2],c2,c3,c1); 1368238384Sjkim $ADDU $t_2,$at 1369238384Sjkim $ADDU $c_3,$t_2 1370238384Sjkim sltu $at,$c_3,$t_2 1371238384Sjkim $ADDU $c_1,$at 1372238384Sjkim mflo $t_1 1373238384Sjkim mfhi $t_2 1374238384Sjkim $ADDU $c_2,$t_1 1375238384Sjkim sltu $at,$c_2,$t_1 1376238384Sjkim $MULTU $a_6,$b_1 # mul_add_c(a[6],b[1],c2,c3,c1); 1377238384Sjkim $ADDU $t_2,$at 1378238384Sjkim $ADDU $c_3,$t_2 1379238384Sjkim sltu $at,$c_3,$t_2 1380238384Sjkim $ADDU $c_1,$at 1381238384Sjkim mflo $t_1 1382238384Sjkim mfhi $t_2 1383238384Sjkim $ADDU $c_2,$t_1 1384238384Sjkim sltu $at,$c_2,$t_1 1385238384Sjkim $MULTU $a_7,$b_0 # mul_add_c(a[7],b[0],c2,c3,c1); 1386238384Sjkim $ADDU $t_2,$at 1387238384Sjkim $ADDU $c_3,$t_2 1388238384Sjkim sltu $at,$c_3,$t_2 1389238384Sjkim $ADDU $c_1,$at 1390238384Sjkim mflo $t_1 1391238384Sjkim mfhi $t_2 1392238384Sjkim $ADDU $c_2,$t_1 1393238384Sjkim sltu $at,$c_2,$t_1 1394238384Sjkim $MULTU $a_7,$b_1 # mul_add_c(a[7],b[1],c3,c1,c2); 1395238384Sjkim $ADDU $t_2,$at 1396238384Sjkim $ADDU $c_3,$t_2 1397238384Sjkim sltu $at,$c_3,$t_2 1398238384Sjkim $ADDU $c_1,$at 1399238384Sjkim $ST $c_2,7*$BNSZ($a0) # r[7]=c2; 1400238384Sjkim 1401238384Sjkim 
mflo $t_1 1402238384Sjkim mfhi $t_2 1403238384Sjkim $ADDU $c_3,$t_1 1404238384Sjkim sltu $at,$c_3,$t_1 1405238384Sjkim $MULTU $a_6,$b_2 # mul_add_c(a[6],b[2],c3,c1,c2); 1406238384Sjkim $ADDU $t_2,$at 1407238384Sjkim $ADDU $c_1,$t_2 1408238384Sjkim sltu $c_2,$c_1,$t_2 1409238384Sjkim mflo $t_1 1410238384Sjkim mfhi $t_2 1411238384Sjkim $ADDU $c_3,$t_1 1412238384Sjkim sltu $at,$c_3,$t_1 1413238384Sjkim $MULTU $a_5,$b_3 # mul_add_c(a[5],b[3],c3,c1,c2); 1414238384Sjkim $ADDU $t_2,$at 1415238384Sjkim $ADDU $c_1,$t_2 1416238384Sjkim sltu $at,$c_1,$t_2 1417238384Sjkim $ADDU $c_2,$at 1418238384Sjkim mflo $t_1 1419238384Sjkim mfhi $t_2 1420238384Sjkim $ADDU $c_3,$t_1 1421238384Sjkim sltu $at,$c_3,$t_1 1422238384Sjkim $MULTU $a_4,$b_4 # mul_add_c(a[4],b[4],c3,c1,c2); 1423238384Sjkim $ADDU $t_2,$at 1424238384Sjkim $ADDU $c_1,$t_2 1425238384Sjkim sltu $at,$c_1,$t_2 1426238384Sjkim $ADDU $c_2,$at 1427238384Sjkim mflo $t_1 1428238384Sjkim mfhi $t_2 1429238384Sjkim $ADDU $c_3,$t_1 1430238384Sjkim sltu $at,$c_3,$t_1 1431238384Sjkim $MULTU $a_3,$b_5 # mul_add_c(a[3],b[5],c3,c1,c2); 1432238384Sjkim $ADDU $t_2,$at 1433238384Sjkim $ADDU $c_1,$t_2 1434238384Sjkim sltu $at,$c_1,$t_2 1435238384Sjkim $ADDU $c_2,$at 1436238384Sjkim mflo $t_1 1437238384Sjkim mfhi $t_2 1438238384Sjkim $ADDU $c_3,$t_1 1439238384Sjkim sltu $at,$c_3,$t_1 1440238384Sjkim $MULTU $a_2,$b_6 # mul_add_c(a[2],b[6],c3,c1,c2); 1441238384Sjkim $ADDU $t_2,$at 1442238384Sjkim $ADDU $c_1,$t_2 1443238384Sjkim sltu $at,$c_1,$t_2 1444238384Sjkim $ADDU $c_2,$at 1445238384Sjkim mflo $t_1 1446238384Sjkim mfhi $t_2 1447238384Sjkim $ADDU $c_3,$t_1 1448238384Sjkim sltu $at,$c_3,$t_1 1449238384Sjkim $MULTU $a_1,$b_7 # mul_add_c(a[1],b[7],c3,c1,c2); 1450238384Sjkim $ADDU $t_2,$at 1451238384Sjkim $ADDU $c_1,$t_2 1452238384Sjkim sltu $at,$c_1,$t_2 1453238384Sjkim $ADDU $c_2,$at 1454238384Sjkim mflo $t_1 1455238384Sjkim mfhi $t_2 1456238384Sjkim $ADDU $c_3,$t_1 1457238384Sjkim sltu $at,$c_3,$t_1 1458238384Sjkim $MULTU $a_2,$b_7 # 
mul_add_c(a[2],b[7],c1,c2,c3); 1459238384Sjkim $ADDU $t_2,$at 1460238384Sjkim $ADDU $c_1,$t_2 1461238384Sjkim sltu $at,$c_1,$t_2 1462238384Sjkim $ADDU $c_2,$at 1463238384Sjkim $ST $c_3,8*$BNSZ($a0) # r[8]=c3; 1464238384Sjkim 1465238384Sjkim mflo $t_1 1466238384Sjkim mfhi $t_2 1467238384Sjkim $ADDU $c_1,$t_1 1468238384Sjkim sltu $at,$c_1,$t_1 1469238384Sjkim $MULTU $a_3,$b_6 # mul_add_c(a[3],b[6],c1,c2,c3); 1470238384Sjkim $ADDU $t_2,$at 1471238384Sjkim $ADDU $c_2,$t_2 1472238384Sjkim sltu $c_3,$c_2,$t_2 1473238384Sjkim mflo $t_1 1474238384Sjkim mfhi $t_2 1475238384Sjkim $ADDU $c_1,$t_1 1476238384Sjkim sltu $at,$c_1,$t_1 1477238384Sjkim $MULTU $a_4,$b_5 # mul_add_c(a[4],b[5],c1,c2,c3); 1478238384Sjkim $ADDU $t_2,$at 1479238384Sjkim $ADDU $c_2,$t_2 1480238384Sjkim sltu $at,$c_2,$t_2 1481238384Sjkim $ADDU $c_3,$at 1482238384Sjkim mflo $t_1 1483238384Sjkim mfhi $t_2 1484238384Sjkim $ADDU $c_1,$t_1 1485238384Sjkim sltu $at,$c_1,$t_1 1486238384Sjkim $MULTU $a_5,$b_4 # mul_add_c(a[5],b[4],c1,c2,c3); 1487238384Sjkim $ADDU $t_2,$at 1488238384Sjkim $ADDU $c_2,$t_2 1489238384Sjkim sltu $at,$c_2,$t_2 1490238384Sjkim $ADDU $c_3,$at 1491238384Sjkim mflo $t_1 1492238384Sjkim mfhi $t_2 1493238384Sjkim $ADDU $c_1,$t_1 1494238384Sjkim sltu $at,$c_1,$t_1 1495238384Sjkim $MULTU $a_6,$b_3 # mul_add_c(a[6],b[3],c1,c2,c3); 1496238384Sjkim $ADDU $t_2,$at 1497238384Sjkim $ADDU $c_2,$t_2 1498238384Sjkim sltu $at,$c_2,$t_2 1499238384Sjkim $ADDU $c_3,$at 1500238384Sjkim mflo $t_1 1501238384Sjkim mfhi $t_2 1502238384Sjkim $ADDU $c_1,$t_1 1503238384Sjkim sltu $at,$c_1,$t_1 1504238384Sjkim $MULTU $a_7,$b_2 # mul_add_c(a[7],b[2],c1,c2,c3); 1505238384Sjkim $ADDU $t_2,$at 1506238384Sjkim $ADDU $c_2,$t_2 1507238384Sjkim sltu $at,$c_2,$t_2 1508238384Sjkim $ADDU $c_3,$at 1509238384Sjkim mflo $t_1 1510238384Sjkim mfhi $t_2 1511238384Sjkim $ADDU $c_1,$t_1 1512238384Sjkim sltu $at,$c_1,$t_1 1513238384Sjkim $MULTU $a_7,$b_3 # mul_add_c(a[7],b[3],c2,c3,c1); 1514238384Sjkim $ADDU $t_2,$at 1515238384Sjkim 
$ADDU $c_2,$t_2 1516238384Sjkim sltu $at,$c_2,$t_2 1517238384Sjkim $ADDU $c_3,$at 1518238384Sjkim $ST $c_1,9*$BNSZ($a0) # r[9]=c1; 1519238384Sjkim 1520238384Sjkim mflo $t_1 1521238384Sjkim mfhi $t_2 1522238384Sjkim $ADDU $c_2,$t_1 1523238384Sjkim sltu $at,$c_2,$t_1 1524238384Sjkim $MULTU $a_6,$b_4 # mul_add_c(a[6],b[4],c2,c3,c1); 1525238384Sjkim $ADDU $t_2,$at 1526238384Sjkim $ADDU $c_3,$t_2 1527238384Sjkim sltu $c_1,$c_3,$t_2 1528238384Sjkim mflo $t_1 1529238384Sjkim mfhi $t_2 1530238384Sjkim $ADDU $c_2,$t_1 1531238384Sjkim sltu $at,$c_2,$t_1 1532238384Sjkim $MULTU $a_5,$b_5 # mul_add_c(a[5],b[5],c2,c3,c1); 1533238384Sjkim $ADDU $t_2,$at 1534238384Sjkim $ADDU $c_3,$t_2 1535238384Sjkim sltu $at,$c_3,$t_2 1536238384Sjkim $ADDU $c_1,$at 1537238384Sjkim mflo $t_1 1538238384Sjkim mfhi $t_2 1539238384Sjkim $ADDU $c_2,$t_1 1540238384Sjkim sltu $at,$c_2,$t_1 1541238384Sjkim $MULTU $a_4,$b_6 # mul_add_c(a[4],b[6],c2,c3,c1); 1542238384Sjkim $ADDU $t_2,$at 1543238384Sjkim $ADDU $c_3,$t_2 1544238384Sjkim sltu $at,$c_3,$t_2 1545238384Sjkim $ADDU $c_1,$at 1546238384Sjkim mflo $t_1 1547238384Sjkim mfhi $t_2 1548238384Sjkim $ADDU $c_2,$t_1 1549238384Sjkim sltu $at,$c_2,$t_1 1550238384Sjkim $MULTU $a_3,$b_7 # mul_add_c(a[3],b[7],c2,c3,c1); 1551238384Sjkim $ADDU $t_2,$at 1552238384Sjkim $ADDU $c_3,$t_2 1553238384Sjkim sltu $at,$c_3,$t_2 1554238384Sjkim $ADDU $c_1,$at 1555238384Sjkim mflo $t_1 1556238384Sjkim mfhi $t_2 1557238384Sjkim $ADDU $c_2,$t_1 1558238384Sjkim sltu $at,$c_2,$t_1 1559238384Sjkim $MULTU $a_4,$b_7 # mul_add_c(a[4],b[7],c3,c1,c2); 1560238384Sjkim $ADDU $t_2,$at 1561238384Sjkim $ADDU $c_3,$t_2 1562238384Sjkim sltu $at,$c_3,$t_2 1563238384Sjkim $ADDU $c_1,$at 1564238384Sjkim $ST $c_2,10*$BNSZ($a0) # r[10]=c2; 1565238384Sjkim 1566238384Sjkim mflo $t_1 1567238384Sjkim mfhi $t_2 1568238384Sjkim $ADDU $c_3,$t_1 1569238384Sjkim sltu $at,$c_3,$t_1 1570238384Sjkim $MULTU $a_5,$b_6 # mul_add_c(a[5],b[6],c3,c1,c2); 1571238384Sjkim $ADDU $t_2,$at 1572238384Sjkim $ADDU 
$c_1,$t_2 1573238384Sjkim sltu $c_2,$c_1,$t_2 1574238384Sjkim mflo $t_1 1575238384Sjkim mfhi $t_2 1576238384Sjkim $ADDU $c_3,$t_1 1577238384Sjkim sltu $at,$c_3,$t_1 1578238384Sjkim $MULTU $a_6,$b_5 # mul_add_c(a[6],b[5],c3,c1,c2); 1579238384Sjkim $ADDU $t_2,$at 1580238384Sjkim $ADDU $c_1,$t_2 1581238384Sjkim sltu $at,$c_1,$t_2 1582238384Sjkim $ADDU $c_2,$at 1583238384Sjkim mflo $t_1 1584238384Sjkim mfhi $t_2 1585238384Sjkim $ADDU $c_3,$t_1 1586238384Sjkim sltu $at,$c_3,$t_1 1587238384Sjkim $MULTU $a_7,$b_4 # mul_add_c(a[7],b[4],c3,c1,c2); 1588238384Sjkim $ADDU $t_2,$at 1589238384Sjkim $ADDU $c_1,$t_2 1590238384Sjkim sltu $at,$c_1,$t_2 1591238384Sjkim $ADDU $c_2,$at 1592238384Sjkim mflo $t_1 1593238384Sjkim mfhi $t_2 1594238384Sjkim $ADDU $c_3,$t_1 1595238384Sjkim sltu $at,$c_3,$t_1 1596238384Sjkim $MULTU $a_7,$b_5 # mul_add_c(a[7],b[5],c1,c2,c3); 1597238384Sjkim $ADDU $t_2,$at 1598238384Sjkim $ADDU $c_1,$t_2 1599238384Sjkim sltu $at,$c_1,$t_2 1600238384Sjkim $ADDU $c_2,$at 1601238384Sjkim $ST $c_3,11*$BNSZ($a0) # r[11]=c3; 1602238384Sjkim 1603238384Sjkim mflo $t_1 1604238384Sjkim mfhi $t_2 1605238384Sjkim $ADDU $c_1,$t_1 1606238384Sjkim sltu $at,$c_1,$t_1 1607238384Sjkim $MULTU $a_6,$b_6 # mul_add_c(a[6],b[6],c1,c2,c3); 1608238384Sjkim $ADDU $t_2,$at 1609238384Sjkim $ADDU $c_2,$t_2 1610238384Sjkim sltu $c_3,$c_2,$t_2 1611238384Sjkim mflo $t_1 1612238384Sjkim mfhi $t_2 1613238384Sjkim $ADDU $c_1,$t_1 1614238384Sjkim sltu $at,$c_1,$t_1 1615238384Sjkim $MULTU $a_5,$b_7 # mul_add_c(a[5],b[7],c1,c2,c3); 1616238384Sjkim $ADDU $t_2,$at 1617238384Sjkim $ADDU $c_2,$t_2 1618238384Sjkim sltu $at,$c_2,$t_2 1619238384Sjkim $ADDU $c_3,$at 1620238384Sjkim mflo $t_1 1621238384Sjkim mfhi $t_2 1622238384Sjkim $ADDU $c_1,$t_1 1623238384Sjkim sltu $at,$c_1,$t_1 1624238384Sjkim $MULTU $a_6,$b_7 # mul_add_c(a[6],b[7],c2,c3,c1); 1625238384Sjkim $ADDU $t_2,$at 1626238384Sjkim $ADDU $c_2,$t_2 1627238384Sjkim sltu $at,$c_2,$t_2 1628238384Sjkim $ADDU $c_3,$at 1629238384Sjkim $ST 
$c_1,12*$BNSZ($a0) # r[12]=c1; 1630238384Sjkim 1631238384Sjkim mflo $t_1 1632238384Sjkim mfhi $t_2 1633238384Sjkim $ADDU $c_2,$t_1 1634238384Sjkim sltu $at,$c_2,$t_1 1635238384Sjkim $MULTU $a_7,$b_6 # mul_add_c(a[7],b[6],c2,c3,c1); 1636238384Sjkim $ADDU $t_2,$at 1637238384Sjkim $ADDU $c_3,$t_2 1638238384Sjkim sltu $c_1,$c_3,$t_2 1639238384Sjkim mflo $t_1 1640238384Sjkim mfhi $t_2 1641238384Sjkim $ADDU $c_2,$t_1 1642238384Sjkim sltu $at,$c_2,$t_1 1643238384Sjkim $MULTU $a_7,$b_7 # mul_add_c(a[7],b[7],c3,c1,c2); 1644238384Sjkim $ADDU $t_2,$at 1645238384Sjkim $ADDU $c_3,$t_2 1646238384Sjkim sltu $at,$c_3,$t_2 1647238384Sjkim $ADDU $c_1,$at 1648238384Sjkim $ST $c_2,13*$BNSZ($a0) # r[13]=c2; 1649238384Sjkim 1650238384Sjkim mflo $t_1 1651238384Sjkim mfhi $t_2 1652238384Sjkim $ADDU $c_3,$t_1 1653238384Sjkim sltu $at,$c_3,$t_1 1654238384Sjkim $ADDU $t_2,$at 1655238384Sjkim $ADDU $c_1,$t_2 1656238384Sjkim $ST $c_3,14*$BNSZ($a0) # r[14]=c3; 1657238384Sjkim $ST $c_1,15*$BNSZ($a0) # r[15]=c1; 1658238384Sjkim 1659238384Sjkim .set noreorder 1660238384Sjkim___ 1661238384Sjkim$code.=<<___ if ($flavour =~ /nubi/i); 1662238384Sjkim $REG_L $s5,10*$SZREG($sp) 1663238384Sjkim $REG_L $s4,9*$SZREG($sp) 1664238384Sjkim $REG_L $s3,8*$SZREG($sp) 1665238384Sjkim $REG_L $s2,7*$SZREG($sp) 1666238384Sjkim $REG_L $s1,6*$SZREG($sp) 1667238384Sjkim $REG_L $s0,5*$SZREG($sp) 1668238384Sjkim $REG_L $t3,4*$SZREG($sp) 1669238384Sjkim $REG_L $t2,3*$SZREG($sp) 1670238384Sjkim $REG_L $t1,2*$SZREG($sp) 1671238384Sjkim $REG_L $t0,1*$SZREG($sp) 1672238384Sjkim $REG_L $gp,0*$SZREG($sp) 1673238384Sjkim jr $ra 1674238384Sjkim $PTR_ADD $sp,12*$SZREG 1675238384Sjkim___ 1676238384Sjkim$code.=<<___ if ($flavour !~ /nubi/i); 1677238384Sjkim $REG_L $s5,5*$SZREG($sp) 1678238384Sjkim $REG_L $s4,4*$SZREG($sp) 1679238384Sjkim $REG_L $s3,3*$SZREG($sp) 1680238384Sjkim $REG_L $s2,2*$SZREG($sp) 1681238384Sjkim $REG_L $s1,1*$SZREG($sp) 1682238384Sjkim $REG_L $s0,0*$SZREG($sp) 1683238384Sjkim jr $ra 1684238384Sjkim $PTR_ADD 
$sp,6*$SZREG 1685238384Sjkim___ 1686238384Sjkim$code.=<<___; 1687238384Sjkim.end bn_mul_comba8 1688238384Sjkim 1689238384Sjkim.align 5 1690238384Sjkim.globl bn_mul_comba4 1691238384Sjkim.ent bn_mul_comba4 1692238384Sjkimbn_mul_comba4: 1693238384Sjkim___ 1694238384Sjkim$code.=<<___ if ($flavour =~ /nubi/i); 1695238384Sjkim .frame $sp,6*$SZREG,$ra 1696238384Sjkim .mask 0x8000f008,-$SZREG 1697238384Sjkim .set noreorder 1698238384Sjkim $PTR_SUB $sp,6*$SZREG 1699238384Sjkim $REG_S $ra,5*$SZREG($sp) 1700238384Sjkim $REG_S $t3,4*$SZREG($sp) 1701238384Sjkim $REG_S $t2,3*$SZREG($sp) 1702238384Sjkim $REG_S $t1,2*$SZREG($sp) 1703238384Sjkim $REG_S $t0,1*$SZREG($sp) 1704238384Sjkim $REG_S $gp,0*$SZREG($sp) 1705238384Sjkim___ 1706238384Sjkim$code.=<<___; 1707238384Sjkim .set reorder 1708238384Sjkim $LD $a_0,0($a1) 1709238384Sjkim $LD $b_0,0($a2) 1710238384Sjkim $LD $a_1,$BNSZ($a1) 1711238384Sjkim $LD $a_2,2*$BNSZ($a1) 1712238384Sjkim $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3); 1713238384Sjkim $LD $a_3,3*$BNSZ($a1) 1714238384Sjkim $LD $b_1,$BNSZ($a2) 1715238384Sjkim $LD $b_2,2*$BNSZ($a2) 1716238384Sjkim $LD $b_3,3*$BNSZ($a2) 1717238384Sjkim mflo $c_1 1718238384Sjkim mfhi $c_2 1719238384Sjkim $ST $c_1,0($a0) 1720238384Sjkim 1721238384Sjkim $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1); 1722238384Sjkim mflo $t_1 1723238384Sjkim mfhi $t_2 1724238384Sjkim $ADDU $c_2,$t_1 1725238384Sjkim sltu $at,$c_2,$t_1 1726238384Sjkim $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1); 1727238384Sjkim $ADDU $c_3,$t_2,$at 1728238384Sjkim mflo $t_1 1729238384Sjkim mfhi $t_2 1730238384Sjkim $ADDU $c_2,$t_1 1731238384Sjkim sltu $at,$c_2,$t_1 1732238384Sjkim $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2); 1733238384Sjkim $ADDU $t_2,$at 1734238384Sjkim $ADDU $c_3,$t_2 1735238384Sjkim sltu $c_1,$c_3,$t_2 1736238384Sjkim $ST $c_2,$BNSZ($a0) 1737238384Sjkim 1738238384Sjkim mflo $t_1 1739238384Sjkim mfhi $t_2 1740238384Sjkim $ADDU $c_3,$t_1 1741238384Sjkim sltu $at,$c_3,$t_1 1742238384Sjkim $MULTU 
$a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2); 1743238384Sjkim $ADDU $t_2,$at 1744238384Sjkim $ADDU $c_1,$t_2 1745238384Sjkim mflo $t_1 1746238384Sjkim mfhi $t_2 1747238384Sjkim $ADDU $c_3,$t_1 1748238384Sjkim sltu $at,$c_3,$t_1 1749238384Sjkim $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2); 1750238384Sjkim $ADDU $t_2,$at 1751238384Sjkim $ADDU $c_1,$t_2 1752238384Sjkim sltu $c_2,$c_1,$t_2 1753238384Sjkim mflo $t_1 1754238384Sjkim mfhi $t_2 1755238384Sjkim $ADDU $c_3,$t_1 1756238384Sjkim sltu $at,$c_3,$t_1 1757238384Sjkim $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3); 1758238384Sjkim $ADDU $t_2,$at 1759238384Sjkim $ADDU $c_1,$t_2 1760238384Sjkim sltu $at,$c_1,$t_2 1761238384Sjkim $ADDU $c_2,$at 1762238384Sjkim $ST $c_3,2*$BNSZ($a0) 1763238384Sjkim 1764238384Sjkim mflo $t_1 1765238384Sjkim mfhi $t_2 1766238384Sjkim $ADDU $c_1,$t_1 1767238384Sjkim sltu $at,$c_1,$t_1 1768238384Sjkim $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3); 1769238384Sjkim $ADDU $t_2,$at 1770238384Sjkim $ADDU $c_2,$t_2 1771238384Sjkim sltu $c_3,$c_2,$t_2 1772238384Sjkim mflo $t_1 1773238384Sjkim mfhi $t_2 1774238384Sjkim $ADDU $c_1,$t_1 1775238384Sjkim sltu $at,$c_1,$t_1 1776238384Sjkim $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3); 1777238384Sjkim $ADDU $t_2,$at 1778238384Sjkim $ADDU $c_2,$t_2 1779238384Sjkim sltu $at,$c_2,$t_2 1780238384Sjkim $ADDU $c_3,$at 1781238384Sjkim mflo $t_1 1782238384Sjkim mfhi $t_2 1783238384Sjkim $ADDU $c_1,$t_1 1784238384Sjkim sltu $at,$c_1,$t_1 1785238384Sjkim $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3); 1786238384Sjkim $ADDU $t_2,$at 1787238384Sjkim $ADDU $c_2,$t_2 1788238384Sjkim sltu $at,$c_2,$t_2 1789238384Sjkim $ADDU $c_3,$at 1790238384Sjkim mflo $t_1 1791238384Sjkim mfhi $t_2 1792238384Sjkim $ADDU $c_1,$t_1 1793238384Sjkim sltu $at,$c_1,$t_1 1794238384Sjkim $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1); 1795238384Sjkim $ADDU $t_2,$at 1796238384Sjkim $ADDU $c_2,$t_2 1797238384Sjkim sltu $at,$c_2,$t_2 1798238384Sjkim $ADDU $c_3,$at 1799238384Sjkim 
$ST $c_1,3*$BNSZ($a0) 1800238384Sjkim 1801238384Sjkim mflo $t_1 1802238384Sjkim mfhi $t_2 1803238384Sjkim $ADDU $c_2,$t_1 1804238384Sjkim sltu $at,$c_2,$t_1 1805238384Sjkim $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1); 1806238384Sjkim $ADDU $t_2,$at 1807238384Sjkim $ADDU $c_3,$t_2 1808238384Sjkim sltu $c_1,$c_3,$t_2 1809238384Sjkim mflo $t_1 1810238384Sjkim mfhi $t_2 1811238384Sjkim $ADDU $c_2,$t_1 1812238384Sjkim sltu $at,$c_2,$t_1 1813238384Sjkim $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1); 1814238384Sjkim $ADDU $t_2,$at 1815238384Sjkim $ADDU $c_3,$t_2 1816238384Sjkim sltu $at,$c_3,$t_2 1817238384Sjkim $ADDU $c_1,$at 1818238384Sjkim mflo $t_1 1819238384Sjkim mfhi $t_2 1820238384Sjkim $ADDU $c_2,$t_1 1821238384Sjkim sltu $at,$c_2,$t_1 1822238384Sjkim $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2); 1823238384Sjkim $ADDU $t_2,$at 1824238384Sjkim $ADDU $c_3,$t_2 1825238384Sjkim sltu $at,$c_3,$t_2 1826238384Sjkim $ADDU $c_1,$at 1827238384Sjkim $ST $c_2,4*$BNSZ($a0) 1828238384Sjkim 1829238384Sjkim mflo $t_1 1830238384Sjkim mfhi $t_2 1831238384Sjkim $ADDU $c_3,$t_1 1832238384Sjkim sltu $at,$c_3,$t_1 1833238384Sjkim $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2); 1834238384Sjkim $ADDU $t_2,$at 1835238384Sjkim $ADDU $c_1,$t_2 1836238384Sjkim sltu $c_2,$c_1,$t_2 1837238384Sjkim mflo $t_1 1838238384Sjkim mfhi $t_2 1839238384Sjkim $ADDU $c_3,$t_1 1840238384Sjkim sltu $at,$c_3,$t_1 1841238384Sjkim $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3); 1842238384Sjkim $ADDU $t_2,$at 1843238384Sjkim $ADDU $c_1,$t_2 1844238384Sjkim sltu $at,$c_1,$t_2 1845238384Sjkim $ADDU $c_2,$at 1846238384Sjkim $ST $c_3,5*$BNSZ($a0) 1847238384Sjkim 1848238384Sjkim mflo $t_1 1849238384Sjkim mfhi $t_2 1850238384Sjkim $ADDU $c_1,$t_1 1851238384Sjkim sltu $at,$c_1,$t_1 1852238384Sjkim $ADDU $t_2,$at 1853238384Sjkim $ADDU $c_2,$t_2 1854238384Sjkim $ST $c_1,6*$BNSZ($a0) 1855238384Sjkim $ST $c_2,7*$BNSZ($a0) 1856238384Sjkim 1857238384Sjkim .set noreorder 1858238384Sjkim___ 
# Epilogue of bn_mul_comba4: under the NUBI flavour restore the registers
# saved in the prologue and release the 6-slot frame, then return.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	nop
.end	bn_mul_comba4
___

# The squaring routines have no b[] operand, so the registers that held
# b[0..3] in the multiplication routines are reused for a[4..7].
($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);

# add_c2 - append to $code the assembly for one doubled column term of a
# comba squaring step: the pending product in HI/LO is fetched and added
# *twice* into the three-word accumulator ($c2:$c1:$c0), while the
# multiplication needed by the following step is already started.
#
# Arguments (all are register names, except $warm which is a flag):
#   $hi, $lo       scratch registers receiving HI and LO of the pending
#                  product (both are clobbered);
#   $c0, $c1, $c2  accumulator words, least significant first;
#   $warm          false on the first call for a given ($c0,$c1,$c2)
#                  sequence, in which case $c2 is *initialised* from the
#                  carry out of $c1 instead of being accumulated into;
#   $an, $bn       operands of the multiplication issued for the *next*
#                  step ("forward multiplication").
#
# The sub is declared without a prototype on purpose: it takes eight
# arguments, so an empty "()" prototype would be wrong and would only go
# unnoticed because the callers use the prototype-bypassing &add_c2(...)
# form.  Existing &add_c2(...) call sites keep working unchanged.
sub add_c2 {
my ($hi,$lo,$c0,$c1,$c2,
    $warm,      # !$warm denotes first call with specific sequence of
                # $c_[XYZ] when there is no Z-carry to accumulate yet;
    $an,$bn     # these two are arguments for multiplication which
                # result is used in *next* step [which is why it's
                # commented as "forward multiplication" below];
    )=@_;

# Common part: c0 += 2*lo, collecting the two carries in $at and $lo,
# fold them into the high word ($at = hi + carry1, $hi = hi + carry2)
# and add the first of the two into c1.
$code.=<<___;
	mflo	$lo
	mfhi	$hi
	$ADDU	$c0,$lo
	sltu	$at,$c0,$lo
	$MULTU	$an,$bn			# forward multiplication
	$ADDU	$c0,$lo
	$ADDU	$at,$hi
	sltu	$lo,$c0,$lo
	$ADDU	$c1,$at
	$ADDU	$hi,$lo
___
# Cold start: c2 holds no carry yet, so the first carry out of c1 is
# written straight into it.
$code.=<<___	if (!$warm);
	sltu	$c2,$c1,$at
	$ADDU	$c1,$hi
	sltu	$hi,$c1,$hi
	$ADDU	$c2,$hi
___
# Warm: c2 already carries state, so both carries are accumulated.
$code.=<<___	if ($warm);
	sltu	$at,$c1,$at
	$ADDU	$c1,$hi
	$ADDU	$c2,$at
	sltu	$hi,$c1,$hi
	$ADDU	$c2,$hi
___
}

# bn_sqr_comba8: entry label, optional NUBI prologue, operand loads and
# the first output columns r[0]..r[2].
$code.=<<___;

.align	5
.globl	bn_sqr_comba8
.ent	bn_sqr_comba8
bn_sqr_comba8:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$a_1,$BNSZ($a1)
	$LD	$a_2,2*$BNSZ($a1)
	$LD	$a_3,3*$BNSZ($a1)

	$MULTU	$a_0,$a_0		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_4,4*$BNSZ($a1)
	$LD	$a_5,5*$BNSZ($a1)
	$LD	$a_6,6*$BNSZ($a1)
	$LD	$a_7,7*$BNSZ($a1)
	mflo	$c_1
	mfhi	$c_2
	$ST	$c_1,0($a0)

	$MULTU	$a_0,$a_1		# mul_add_c2(a[0],b[1],c2,c3,c1);
	mflo	$t_1
	mfhi	$t_2
	slt	$c_1,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_2,$a_0		# mul_add_c2(a[2],b[0],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$c_3,$t_2,$at
	$ST	$c_2,$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_1,$a_1);		# mul_add_c(a[1],b[1],c3,c1,c2);
$code.=<<___;
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_0,$a_3		# mul_add_c2(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)
___
# Remainder of bn_sqr_comba8, columns 3..15 of the result.  Every add_c2
# call folds twice the pending product into the accumulator registers it
# is given and issues the multiplication named in its trailing comment;
# the plain heredoc sequences between them add a single (diagonal) product.
# The instruction order is significant: each step consumes the HI/LO pair
# produced by the multiplication issued in the previous step.
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
	$a_1,$a_2);		# mul_add_c2(a[1],b[2],c1,c2,c3);
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
	$a_4,$a_0);		# mul_add_c2(a[4],b[0],c2,c3,c1);
$code.=<<___;
	$ST	$c_1,3*$BNSZ($a0)
___
&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
	$a_3,$a_1);		# mul_add_c2(a[3],b[1],c2,c3,c1);
&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
	$a_2,$a_2);		# mul_add_c(a[2],b[2],c2,c3,c1);
$code.=<<___;
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_0,$a_5		# mul_add_c2(a[0],b[5],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)
___
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
	$a_1,$a_4);		# mul_add_c2(a[1],b[4],c3,c1,c2);
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
	$a_2,$a_3);		# mul_add_c2(a[2],b[3],c3,c1,c2);
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
	$a_6,$a_0);		# mul_add_c2(a[6],b[0],c1,c2,c3);
$code.=<<___;
	$ST	$c_3,5*$BNSZ($a0)
___
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
	$a_5,$a_1);		# mul_add_c2(a[5],b[1],c1,c2,c3);
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
	$a_4,$a_2);		# mul_add_c2(a[4],b[2],c1,c2,c3);
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
	$a_3,$a_3);		# mul_add_c(a[3],b[3],c1,c2,c3);
$code.=<<___;
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_0,$a_7		# mul_add_c2(a[0],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,6*$BNSZ($a0)
___
&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
	$a_1,$a_6);		# mul_add_c2(a[1],b[6],c2,c3,c1);
&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
	$a_2,$a_5);		# mul_add_c2(a[2],b[5],c2,c3,c1);
&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
	$a_3,$a_4);		# mul_add_c2(a[3],b[4],c2,c3,c1);
&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
	$a_7,$a_1);		# mul_add_c2(a[7],b[1],c3,c1,c2);
$code.=<<___;
	$ST	$c_2,7*$BNSZ($a0)
___
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
	$a_6,$a_2);		# mul_add_c2(a[6],b[2],c3,c1,c2);
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
	$a_5,$a_3);		# mul_add_c2(a[5],b[3],c3,c1,c2);
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
	$a_4,$a_4);		# mul_add_c(a[4],b[4],c3,c1,c2);
$code.=<<___;
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_2,$a_7		# mul_add_c2(a[2],b[7],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,8*$BNSZ($a0)
___
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
	$a_3,$a_6);		# mul_add_c2(a[3],b[6],c1,c2,c3);
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
	$a_4,$a_5);		# mul_add_c2(a[4],b[5],c1,c2,c3);
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
	$a_7,$a_3);		# mul_add_c2(a[7],b[3],c2,c3,c1);
$code.=<<___;
	$ST	$c_1,9*$BNSZ($a0)
___
&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
	$a_6,$a_4);		# mul_add_c2(a[6],b[4],c2,c3,c1);
&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
	$a_5,$a_5);		# mul_add_c(a[5],b[5],c2,c3,c1);
$code.=<<___;
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_4,$a_7		# mul_add_c2(a[4],b[7],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,10*$BNSZ($a0)
___
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
	$a_5,$a_6);		# mul_add_c2(a[5],b[6],c3,c1,c2);
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
	$a_7,$a_5);		# mul_add_c2(a[7],b[5],c1,c2,c3);
$code.=<<___;
	$ST	$c_3,11*$BNSZ($a0)
___
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
	$a_6,$a_6);		# mul_add_c(a[6],b[6],c1,c2,c3);
$code.=<<___;
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_6,$a_7		# mul_add_c2(a[6],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,12*$BNSZ($a0)
___
&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
	$a_7,$a_7);		# mul_add_c(a[7],b[7],c3,c1,c2);
$code.=<<___;
	$ST	$c_2,13*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	$ST	$c_3,14*$BNSZ($a0)
	$ST	$c_1,15*$BNSZ($a0)

	.set	noreorder
___
# NUBI flavour: reload $t0-$t3 and $gp and pop the 6-slot frame.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# Return from bn_sqr_comba8, then open bn_sqr_comba4: results are stored
# through $a0, input words a[0..3] are loaded through $a1.
$code.=<<___;
	jr	$ra
	nop
.end	bn_sqr_comba8

.align	5
.globl	bn_sqr_comba4
.ent	bn_sqr_comba4
bn_sqr_comba4:
___
# NUBI flavour: allocate a 6-slot frame and save $ra, $t0-$t3 and $gp.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$a_1,$BNSZ($a1)
	$MULTU	$a_0,$a_0		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_2,2*$BNSZ($a1)
	$LD	$a_3,3*$BNSZ($a1)
	mflo	$c_1
	mfhi	$c_2
	$ST	$c_1,0($a0)

	$MULTU	$a_0,$a_1		# mul_add_c2(a[0],b[1],c2,c3,c1);
	mflo	$t_1
	mfhi	$t_2
	slt	$c_1,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_2,$a_0		# mul_add_c2(a[2],b[0],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$c_3,$t_2,$at
	$ST	$c_2,$BNSZ($a0)
___
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
	$a_1,$a_1);		# mul_add_c(a[1],b[1],c3,c1,c2);
$code.=<<___;
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_0,$a_3		# mul_add_c2(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)
___
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
	$a_1,$a_2);		# mul_add_c2(a[1],b[2],c1,c2,c3);
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
	$a_3,$a_1);		# mul_add_c2(a[3],b[1],c2,c3,c1);
$code.=<<___;
	$ST	$c_1,3*$BNSZ($a0)
___
&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
	$a_2,$a_2);		# mul_add_c(a[2],b[2],c2,c3,c1);
$code.=<<___;
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_2,$a_3		# mul_add_c2(a[2],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)
___
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
	$a_3,$a_3);		# mul_add_c(a[3],b[3],c1,c2,c3);
$code.=<<___;
	$ST	$c_3,5*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	$ST	$c_1,6*$BNSZ($a0)
	$ST	$c_2,7*$BNSZ($a0)

	.set	noreorder
___
# NUBI flavour: reload $t0-$t3 and $gp and pop the 6-slot frame.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	nop
.end	bn_sqr_comba4
___

# Emit the accumulated assembly.  Buffered write errors (full disk, closed
# pipe) only surface when the handle is flushed at close, so a failing
# close must abort the build instead of leaving a truncated output file.
print $code;
close STDOUT or die "error closing STDOUT: $!";