#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2005
#
# "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
# Because unlike integer multiplier, which simply stalls whole CPU,
# FPU is fully pipelined and can effectively emit 48 bit partial
# product every cycle. Why not blended SPARC v9? One can argue that
# making this module dependent on UltraSPARC VIS extension limits its
# binary compatibility. Well yes, it does exclude SPARC64 prior-V(!)
# implementations from compatibility matrix. But the rest, whole Sun
# UltraSPARC family and brand new Fujitsu's SPARC64 V, all support
# VIS extension instructions used in this module. This is considered
# good enough to not care about HAL SPARC64 users [if any] who have
# integer-only pure SPARCv9 module to "fall down" to.

# USI&II cores currently exhibit uniform 2x improvement [over pre-
# bn_mul_mont codebase] for all key lengths and benchmarks. On USIII
# performance improves few percents for shorter keys and worsens few
# percents for longer keys. This is because USIII integer multiplier
# is >3x faster than USI&II one, which is harder to match [but see
# TODO list below]. It should also be noted that SPARC64 V features
# out-of-order execution, which *might* mean that integer multiplier
# is pipelined, which in turn *might* be impossible to match... On
# additional note, SPARC64 V implements FP Multiply-Add instruction,
# which is perfectly usable in this context... In other words, as far
# as Fujitsu SPARC64 V goes, talk to the author:-)

# The implementation implies following "non-natural" limitations on
# input arguments:
# - num may not be less than 4;
# - num has to be even;
# Failure to meet either condition has no fatal effects, simply
# doesn't give any performance gain.

# TODO:
# - modulo-schedule inner loop for better performance (on in-order
#   execution core such as UltraSPARC this shall result in further
#   noticeable(!) improvement);
# - dedicated squaring procedure[?];

######################################################################
# November 2006
#
# Modulo-scheduled inner loops allow to interleave floating point and
# integer instructions and minimize Read-After-Write penalties. This
# results in *further* 20-50% performance improvement [depending on
# key length, more for longer keys] on USI&II cores and 30-80% - on
# USIII&IV.

# Usage: the script writes the generated assembly to STDOUT. Passing
# an argument matching -m64 or -xarch=v9 selects the 64-bit flavour
# (stack bias 2047, 192-byte frame); otherwise the 32-bit flavour
# (no bias, 128-byte frame) is generated.

use strict;
use warnings;

my $fname="bn_mul_mont_fpu";
my $bits=32;
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }

my ($bias,$frame);
if ($bits==64) {
	$bias=2047;
	$frame=192;
} else {
	$bias=0;
	$frame=128;	# 96 rounded up to largest known cache-line
}
my $locals=64;		# scratch area above the frame, see [%sp+$bias+$frame+N]

# In order to provide for 32-/64-bit ABI duality, I keep integers wider
# than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used
# exclusively for pointers, indexes and other small values...
# int bn_mul_mont(
my $rp="%i0";	# BN_ULONG *rp,
my $ap="%i1";	# const BN_ULONG *ap,
my $bp="%i2";	# const BN_ULONG *bp,
my $np="%i3";	# const BN_ULONG *np,
my $n0="%i4";	# const BN_ULONG *n0,
my $num="%i5";	# int num);

my $tp="%l0";	# t[num]
my $ap_l="%l1";	# a[num],n[num] are smashed to 32-bit words and saved
my $ap_h="%l2";	# to these four vectors as double-precision FP values.
my $np_l="%l3";	# This way a bunch of fxtods are eliminated in second
my $np_h="%l4";	# loop and L1-cache aliasing is minimized...
my $i="%l5";
my $j="%l6";
my $mask="%l7";	# 16-bit mask, 0xffff

$n0="%g4";	# reassigned(!) to "64-bit" register
my $carry="%i4";	# %i4 reused(!) for a carry bit

# FP register naming chart
#
#     ..HILO
#       dcba
#   --------
#        LOa
#       LOb
#      LOc
#     LOd
#      HIa
#     HIb
#    HIc
#   HId
#    ..a
#   ..b
my ($ba,$bb,$bc,$bd)=("%f0","%f2","%f4","%f6");
my ($na,$nb,$nc,$nd)=("%f8","%f10","%f12","%f14");
my ($alo,$alo_,$ahi,$ahi_)=("%f16","%f17","%f18","%f19");
my ($nlo,$nlo_,$nhi,$nhi_)=("%f20","%f21","%f22","%f23");

my ($dota,$dotb)=("%f24","%f26");

my ($aloa,$alob,$aloc,$alod)=("%f32","%f34","%f36","%f38");
my ($ahia,$ahib,$ahic,$ahid)=("%f40","%f42","%f44","%f46");
my ($nloa,$nlob,$nloc,$nlod)=("%f48","%f50","%f52","%f54");
my ($nhia,$nhib,$nhic,$nhid)=("%f56","%f58","%f60","%f62");

my $ASI_FL16_P=0xD2;	# magic ASI value to engage 16-bit FP load

# Assembly template. Perl interpolates the register names and the
# frame/bias constants defined above; everything else is passed
# through to the assembler verbatim.
my $code=<<___;
.section	".text",#alloc,#execinstr

.global	$fname
.align	32
$fname:
	save	%sp,-$frame-$locals,%sp

	cmp	$num,4
	bl,a,pn	%icc,.Lret
	clr	%i0
	andcc	$num,1,%g0		! $num has to be even...
	bnz,a,pn	%icc,.Lret
	clr	%i0			! signal "unsupported input value"

	srl	$num,1,$num
	sethi	%hi(0xffff),$mask
	ld	[%i4+0],$n0		! $n0 reassigned, remember?
	or	$mask,%lo(0xffff),$mask
	ld	[%i4+4],%o0
	sllx	%o0,32,%o0
	or	%o0,$n0,$n0		! $n0=n0[1].n0[0]

	sll	$num,3,$num		! num*=8

	add	%sp,$bias,%o0		! real top of stack
	sll	$num,2,%o1
	add	%o1,$num,%o1		! %o1=num*5
	sub	%o0,%o1,%o0
	and	%o0,-2048,%o0		! optimize TLB utilization
	sub	%o0,$bias,%sp		! alloca(5*num*8)

	rd	%asi,%o7		! save %asi
	add	%sp,$bias+$frame+$locals,$tp
	add	$tp,$num,$ap_l
	add	$ap_l,$num,$ap_l	! [an]p_[lh] point at the vectors' ends !
	add	$ap_l,$num,$ap_h
	add	$ap_h,$num,$np_l
	add	$np_l,$num,$np_h

	wr	%g0,$ASI_FL16_P,%asi	! setup %asi for 16-bit FP loads

	add	$rp,$num,$rp		! readjust input pointers to point
	add	$ap,$num,$ap		! at the ends too...
	add	$bp,$num,$bp
	add	$np,$num,$np

	stx	%o7,[%sp+$bias+$frame+48]	! save %asi

	sub	%g0,$num,$i		! i=-num
	sub	%g0,$num,$j		! j=-num

	add	$ap,$j,%o3
	add	$bp,$i,%o4

	ld	[%o3+4],%g1		! bp[0]
	ld	[%o3+0],%o0
	ld	[%o4+4],%g5		! ap[0]
	sllx	%g1,32,%g1
	ld	[%o4+0],%o1
	sllx	%g5,32,%g5
	or	%g1,%o0,%o0
	or	%g5,%o1,%o1

	add	$np,$j,%o5

	mulx	%o1,%o0,%o0		! ap[0]*bp[0]
	mulx	$n0,%o0,%o0		! ap[0]*bp[0]*n0
	stx	%o0,[%sp+$bias+$frame+0]

	ld	[%o3+0],$alo_	! load a[j] as pair of 32-bit words
	fzeros	$alo
	ld	[%o3+4],$ahi_
	fzeros	$ahi
	ld	[%o5+0],$nlo_	! load n[j] as pair of 32-bit words
	fzeros	$nlo
	ld	[%o5+4],$nhi_
	fzeros	$nhi

	! transfer b[i] to FPU as 4x16-bit values
	ldda	[%o4+2]%asi,$ba
	fxtod	$alo,$alo
	ldda	[%o4+0]%asi,$bb
	fxtod	$ahi,$ahi
	ldda	[%o4+6]%asi,$bc
	fxtod	$nlo,$nlo
	ldda	[%o4+4]%asi,$bd
	fxtod	$nhi,$nhi

	! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
	ldda	[%sp+$bias+$frame+6]%asi,$na
	fxtod	$ba,$ba
	ldda	[%sp+$bias+$frame+4]%asi,$nb
	fxtod	$bb,$bb
	ldda	[%sp+$bias+$frame+2]%asi,$nc
	fxtod	$bc,$bc
	ldda	[%sp+$bias+$frame+0]%asi,$nd
	fxtod	$bd,$bd

	std	$alo,[$ap_l+$j]		! save smashed ap[j] in double format
	fxtod	$na,$na
	std	$ahi,[$ap_h+$j]
	fxtod	$nb,$nb
	std	$nlo,[$np_l+$j]		! save smashed np[j] in double format
	fxtod	$nc,$nc
	std	$nhi,[$np_h+$j]
	fxtod	$nd,$nd

		fmuld	$alo,$ba,$aloa
		fmuld	$nlo,$na,$nloa
		fmuld	$alo,$bb,$alob
		fmuld	$nlo,$nb,$nlob
		fmuld	$alo,$bc,$aloc
	faddd	$aloa,$nloa,$nloa
		fmuld	$nlo,$nc,$nloc
		fmuld	$alo,$bd,$alod
	faddd	$alob,$nlob,$nlob
		fmuld	$nlo,$nd,$nlod
		fmuld	$ahi,$ba,$ahia
	faddd	$aloc,$nloc,$nloc
		fmuld	$nhi,$na,$nhia
		fmuld	$ahi,$bb,$ahib
	faddd	$alod,$nlod,$nlod
		fmuld	$nhi,$nb,$nhib
		fmuld	$ahi,$bc,$ahic
	faddd	$ahia,$nhia,$nhia
		fmuld	$nhi,$nc,$nhic
		fmuld	$ahi,$bd,$ahid
	faddd	$ahib,$nhib,$nhib
		fmuld	$nhi,$nd,$nhid

	faddd	$ahic,$nhic,$dota	! $nhic
	faddd	$ahid,$nhid,$dotb	! $nhid

	faddd	$nloc,$nhia,$nloc
	faddd	$nlod,$nhib,$nlod

	fdtox	$nloa,$nloa
	fdtox	$nlob,$nlob
	fdtox	$nloc,$nloc
	fdtox	$nlod,$nlod

	std	$nloa,[%sp+$bias+$frame+0]
	add	$j,8,$j
	std	$nlob,[%sp+$bias+$frame+8]
	add	$ap,$j,%o4
	std	$nloc,[%sp+$bias+$frame+16]
	add	$np,$j,%o5
	std	$nlod,[%sp+$bias+$frame+24]

	ld	[%o4+0],$alo_	! load a[j] as pair of 32-bit words
	fzeros	$alo
	ld	[%o4+4],$ahi_
	fzeros	$ahi
	ld	[%o5+0],$nlo_	! load n[j] as pair of 32-bit words
	fzeros	$nlo
	ld	[%o5+4],$nhi_
	fzeros	$nhi

	fxtod	$alo,$alo
	fxtod	$ahi,$ahi
	fxtod	$nlo,$nlo
	fxtod	$nhi,$nhi

	ldx	[%sp+$bias+$frame+0],%o0
		fmuld	$alo,$ba,$aloa
	ldx	[%sp+$bias+$frame+8],%o1
		fmuld	$nlo,$na,$nloa
	ldx	[%sp+$bias+$frame+16],%o2
		fmuld	$alo,$bb,$alob
	ldx	[%sp+$bias+$frame+24],%o3
		fmuld	$nlo,$nb,$nlob

	srlx	%o0,16,%o7
	std	$alo,[$ap_l+$j]		! save smashed ap[j] in double format
		fmuld	$alo,$bc,$aloc
	add	%o7,%o1,%o1
	std	$ahi,[$ap_h+$j]
		faddd	$aloa,$nloa,$nloa
		fmuld	$nlo,$nc,$nloc
	srlx	%o1,16,%o7
	std	$nlo,[$np_l+$j]		! save smashed np[j] in double format
		fmuld	$alo,$bd,$alod
	add	%o7,%o2,%o2
	std	$nhi,[$np_h+$j]
		faddd	$alob,$nlob,$nlob
		fmuld	$nlo,$nd,$nlod
	srlx	%o2,16,%o7
		fmuld	$ahi,$ba,$ahia
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
		faddd	$aloc,$nloc,$nloc
		fmuld	$nhi,$na,$nhia
	!and	%o0,$mask,%o0
	!and	%o1,$mask,%o1
	!and	%o2,$mask,%o2
	!sllx	%o1,16,%o1
	!sllx	%o2,32,%o2
	!sllx	%o3,48,%o7
	!or	%o1,%o0,%o0
	!or	%o2,%o0,%o0
	!or	%o7,%o0,%o0		! 64-bit result
	srlx	%o3,16,%g1		! 34-bit carry
		fmuld	$ahi,$bb,$ahib

	faddd	$alod,$nlod,$nlod
		fmuld	$nhi,$nb,$nhib
		fmuld	$ahi,$bc,$ahic
	faddd	$ahia,$nhia,$nhia
		fmuld	$nhi,$nc,$nhic
		fmuld	$ahi,$bd,$ahid
	faddd	$ahib,$nhib,$nhib
		fmuld	$nhi,$nd,$nhid

	faddd	$dota,$nloa,$nloa
	faddd	$dotb,$nlob,$nlob
	faddd	$ahic,$nhic,$dota	! $nhic
	faddd	$ahid,$nhid,$dotb	! $nhid

	faddd	$nloc,$nhia,$nloc
	faddd	$nlod,$nhib,$nlod

	fdtox	$nloa,$nloa
	fdtox	$nlob,$nlob
	fdtox	$nloc,$nloc
	fdtox	$nlod,$nlod

	std	$nloa,[%sp+$bias+$frame+0]
	std	$nlob,[%sp+$bias+$frame+8]
	addcc	$j,8,$j
	std	$nloc,[%sp+$bias+$frame+16]
	bz,pn	%icc,.L1stskip
	std	$nlod,[%sp+$bias+$frame+24]

.align	32			! incidentally already aligned !
.L1st:
	add	$ap,$j,%o4
	add	$np,$j,%o5
	ld	[%o4+0],$alo_	! load a[j] as pair of 32-bit words
	fzeros	$alo
	ld	[%o4+4],$ahi_
	fzeros	$ahi
	ld	[%o5+0],$nlo_	! load n[j] as pair of 32-bit words
	fzeros	$nlo
	ld	[%o5+4],$nhi_
	fzeros	$nhi

	fxtod	$alo,$alo
	fxtod	$ahi,$ahi
	fxtod	$nlo,$nlo
	fxtod	$nhi,$nhi

	ldx	[%sp+$bias+$frame+0],%o0
		fmuld	$alo,$ba,$aloa
	ldx	[%sp+$bias+$frame+8],%o1
		fmuld	$nlo,$na,$nloa
	ldx	[%sp+$bias+$frame+16],%o2
		fmuld	$alo,$bb,$alob
	ldx	[%sp+$bias+$frame+24],%o3
		fmuld	$nlo,$nb,$nlob

	srlx	%o0,16,%o7
	std	$alo,[$ap_l+$j]		! save smashed ap[j] in double format
		fmuld	$alo,$bc,$aloc
	add	%o7,%o1,%o1
	std	$ahi,[$ap_h+$j]
		faddd	$aloa,$nloa,$nloa
		fmuld	$nlo,$nc,$nloc
	srlx	%o1,16,%o7
	std	$nlo,[$np_l+$j]		! save smashed np[j] in double format
		fmuld	$alo,$bd,$alod
	add	%o7,%o2,%o2
	std	$nhi,[$np_h+$j]
		faddd	$alob,$nlob,$nlob
		fmuld	$nlo,$nd,$nlod
	srlx	%o2,16,%o7
		fmuld	$ahi,$ba,$ahia
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,$mask,%o0
		faddd	$aloc,$nloc,$nloc
		fmuld	$nhi,$na,$nhia
	and	%o1,$mask,%o1
	and	%o2,$mask,%o2
		fmuld	$ahi,$bb,$ahib
	sllx	%o1,16,%o1
		faddd	$alod,$nlod,$nlod
		fmuld	$nhi,$nb,$nhib
	sllx	%o2,32,%o2
		fmuld	$ahi,$bc,$ahic
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
		faddd	$ahia,$nhia,$nhia
		fmuld	$nhi,$nc,$nhic
	or	%o2,%o0,%o0
		fmuld	$ahi,$bd,$ahid
	or	%o7,%o0,%o0		! 64-bit result
		faddd	$ahib,$nhib,$nhib
		fmuld	$nhi,$nd,$nhid
	addcc	%g1,%o0,%o0
		faddd	$dota,$nloa,$nloa
	srlx	%o3,16,%g1		! 34-bit carry
		faddd	$dotb,$nlob,$nlob
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[$tp]		! tp[j-1]=

	faddd	$ahic,$nhic,$dota	! $nhic
	faddd	$ahid,$nhid,$dotb	! $nhid

	faddd	$nloc,$nhia,$nloc
	faddd	$nlod,$nhib,$nlod

	fdtox	$nloa,$nloa
	fdtox	$nlob,$nlob
	fdtox	$nloc,$nloc
	fdtox	$nlod,$nlod

	std	$nloa,[%sp+$bias+$frame+0]
	std	$nlob,[%sp+$bias+$frame+8]
	std	$nloc,[%sp+$bias+$frame+16]
	std	$nlod,[%sp+$bias+$frame+24]

	addcc	$j,8,$j
	bnz,pt	%icc,.L1st
	add	$tp,8,$tp

.L1stskip:
	fdtox	$dota,$dota
	fdtox	$dotb,$dotb

	ldx	[%sp+$bias+$frame+0],%o0
	ldx	[%sp+$bias+$frame+8],%o1
	ldx	[%sp+$bias+$frame+16],%o2
	ldx	[%sp+$bias+$frame+24],%o3

	srlx	%o0,16,%o7
	std	$dota,[%sp+$bias+$frame+32]
	add	%o7,%o1,%o1
	std	$dotb,[%sp+$bias+$frame+40]
	srlx	%o1,16,%o7
	add	%o7,%o2,%o2
	srlx	%o2,16,%o7
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,$mask,%o0
	and	%o1,$mask,%o1
	and	%o2,$mask,%o2
	sllx	%o1,16,%o1
	sllx	%o2,32,%o2
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	or	%o2,%o0,%o0
	or	%o7,%o0,%o0		! 64-bit result
	ldx	[%sp+$bias+$frame+32],%o4
	addcc	%g1,%o0,%o0
	ldx	[%sp+$bias+$frame+40],%o5
	srlx	%o3,16,%g1		! 34-bit carry
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[$tp]		! tp[j-1]=
	add	$tp,8,$tp

	srlx	%o4,16,%o7
	add	%o7,%o5,%o5
	and	%o4,$mask,%o4
	sllx	%o5,16,%o7
	or	%o7,%o4,%o4
	addcc	%g1,%o4,%o4
	srlx	%o5,48,%g1
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	mov	%g1,$carry
	stx	%o4,[$tp]		! tp[num-1]=

	ba	.Louter
	add	$i,8,$i
.align	32
.Louter:
	sub	%g0,$num,$j		! j=-num
	add	%sp,$bias+$frame+$locals,$tp

	add	$ap,$j,%o3
	add	$bp,$i,%o4

	ld	[%o3+4],%g1		! bp[i]
	ld	[%o3+0],%o0
	ld	[%o4+4],%g5		! ap[0]
	sllx	%g1,32,%g1
	ld	[%o4+0],%o1
	sllx	%g5,32,%g5
	or	%g1,%o0,%o0
	or	%g5,%o1,%o1

	ldx	[$tp],%o2		! tp[0]
	mulx	%o1,%o0,%o0
	addcc	%o2,%o0,%o0
	mulx	$n0,%o0,%o0		! (ap[0]*bp[i]+t[0])*n0
	stx	%o0,[%sp+$bias+$frame+0]

	! transfer b[i] to FPU as 4x16-bit values
	ldda	[%o4+2]%asi,$ba
	ldda	[%o4+0]%asi,$bb
	ldda	[%o4+6]%asi,$bc
	ldda	[%o4+4]%asi,$bd

	! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
	ldda	[%sp+$bias+$frame+6]%asi,$na
	fxtod	$ba,$ba
	ldda	[%sp+$bias+$frame+4]%asi,$nb
	fxtod	$bb,$bb
	ldda	[%sp+$bias+$frame+2]%asi,$nc
	fxtod	$bc,$bc
	ldda	[%sp+$bias+$frame+0]%asi,$nd
	fxtod	$bd,$bd
	ldd	[$ap_l+$j],$alo		! load a[j] in double format
	fxtod	$na,$na
	ldd	[$ap_h+$j],$ahi
	fxtod	$nb,$nb
	ldd	[$np_l+$j],$nlo		! load n[j] in double format
	fxtod	$nc,$nc
	ldd	[$np_h+$j],$nhi
	fxtod	$nd,$nd

		fmuld	$alo,$ba,$aloa
		fmuld	$nlo,$na,$nloa
		fmuld	$alo,$bb,$alob
		fmuld	$nlo,$nb,$nlob
		fmuld	$alo,$bc,$aloc
	faddd	$aloa,$nloa,$nloa
		fmuld	$nlo,$nc,$nloc
		fmuld	$alo,$bd,$alod
	faddd	$alob,$nlob,$nlob
		fmuld	$nlo,$nd,$nlod
		fmuld	$ahi,$ba,$ahia
	faddd	$aloc,$nloc,$nloc
		fmuld	$nhi,$na,$nhia
		fmuld	$ahi,$bb,$ahib
	faddd	$alod,$nlod,$nlod
		fmuld	$nhi,$nb,$nhib
		fmuld	$ahi,$bc,$ahic
	faddd	$ahia,$nhia,$nhia
		fmuld	$nhi,$nc,$nhic
		fmuld	$ahi,$bd,$ahid
	faddd	$ahib,$nhib,$nhib
		fmuld	$nhi,$nd,$nhid

	faddd	$ahic,$nhic,$dota	! $nhic
	faddd	$ahid,$nhid,$dotb	! $nhid

	faddd	$nloc,$nhia,$nloc
	faddd	$nlod,$nhib,$nlod

	fdtox	$nloa,$nloa
	fdtox	$nlob,$nlob
	fdtox	$nloc,$nloc
	fdtox	$nlod,$nlod

	std	$nloa,[%sp+$bias+$frame+0]
	std	$nlob,[%sp+$bias+$frame+8]
	std	$nloc,[%sp+$bias+$frame+16]
	add	$j,8,$j
	std	$nlod,[%sp+$bias+$frame+24]

	ldd	[$ap_l+$j],$alo		! load a[j] in double format
	ldd	[$ap_h+$j],$ahi
	ldd	[$np_l+$j],$nlo		! load n[j] in double format
	ldd	[$np_h+$j],$nhi

		fmuld	$alo,$ba,$aloa
		fmuld	$nlo,$na,$nloa
		fmuld	$alo,$bb,$alob
		fmuld	$nlo,$nb,$nlob
		fmuld	$alo,$bc,$aloc
	ldx	[%sp+$bias+$frame+0],%o0
		faddd	$aloa,$nloa,$nloa
		fmuld	$nlo,$nc,$nloc
	ldx	[%sp+$bias+$frame+8],%o1
		fmuld	$alo,$bd,$alod
	ldx	[%sp+$bias+$frame+16],%o2
		faddd	$alob,$nlob,$nlob
		fmuld	$nlo,$nd,$nlod
	ldx	[%sp+$bias+$frame+24],%o3
		fmuld	$ahi,$ba,$ahia

	srlx	%o0,16,%o7
		faddd	$aloc,$nloc,$nloc
		fmuld	$nhi,$na,$nhia
	add	%o7,%o1,%o1
		fmuld	$ahi,$bb,$ahib
	srlx	%o1,16,%o7
		faddd	$alod,$nlod,$nlod
		fmuld	$nhi,$nb,$nhib
	add	%o7,%o2,%o2
		fmuld	$ahi,$bc,$ahic
	srlx	%o2,16,%o7
		faddd	$ahia,$nhia,$nhia
		fmuld	$nhi,$nc,$nhic
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	! why?
	and	%o0,$mask,%o0
		fmuld	$ahi,$bd,$ahid
	and	%o1,$mask,%o1
	and	%o2,$mask,%o2
		faddd	$ahib,$nhib,$nhib
		fmuld	$nhi,$nd,$nhid
	sllx	%o1,16,%o1
		faddd	$dota,$nloa,$nloa
	sllx	%o2,32,%o2
		faddd	$dotb,$nlob,$nlob
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
		faddd	$ahic,$nhic,$dota	! $nhic
	or	%o2,%o0,%o0
		faddd	$ahid,$nhid,$dotb	! $nhid
	or	%o7,%o0,%o0		! 64-bit result
	ldx	[$tp],%o7
		faddd	$nloc,$nhia,$nloc
	addcc	%o7,%o0,%o0
	! end-of-why?
		faddd	$nlod,$nhib,$nlod
	srlx	%o3,16,%g1		! 34-bit carry
		fdtox	$nloa,$nloa
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	fdtox	$nlob,$nlob
	fdtox	$nloc,$nloc
	fdtox	$nlod,$nlod

	std	$nloa,[%sp+$bias+$frame+0]
	std	$nlob,[%sp+$bias+$frame+8]
	addcc	$j,8,$j
	std	$nloc,[%sp+$bias+$frame+16]
	bz,pn	%icc,.Linnerskip
	std	$nlod,[%sp+$bias+$frame+24]

	ba	.Linner
	nop
.align	32
.Linner:
	ldd	[$ap_l+$j],$alo		! load a[j] in double format
	ldd	[$ap_h+$j],$ahi
	ldd	[$np_l+$j],$nlo		! load n[j] in double format
	ldd	[$np_h+$j],$nhi

		fmuld	$alo,$ba,$aloa
		fmuld	$nlo,$na,$nloa
		fmuld	$alo,$bb,$alob
		fmuld	$nlo,$nb,$nlob
		fmuld	$alo,$bc,$aloc
	ldx	[%sp+$bias+$frame+0],%o0
		faddd	$aloa,$nloa,$nloa
		fmuld	$nlo,$nc,$nloc
	ldx	[%sp+$bias+$frame+8],%o1
		fmuld	$alo,$bd,$alod
	ldx	[%sp+$bias+$frame+16],%o2
		faddd	$alob,$nlob,$nlob
		fmuld	$nlo,$nd,$nlod
	ldx	[%sp+$bias+$frame+24],%o3
		fmuld	$ahi,$ba,$ahia

	srlx	%o0,16,%o7
		faddd	$aloc,$nloc,$nloc
		fmuld	$nhi,$na,$nhia
	add	%o7,%o1,%o1
		fmuld	$ahi,$bb,$ahib
	srlx	%o1,16,%o7
		faddd	$alod,$nlod,$nlod
		fmuld	$nhi,$nb,$nhib
	add	%o7,%o2,%o2
		fmuld	$ahi,$bc,$ahic
	srlx	%o2,16,%o7
		faddd	$ahia,$nhia,$nhia
		fmuld	$nhi,$nc,$nhic
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,$mask,%o0
		fmuld	$ahi,$bd,$ahid
	and	%o1,$mask,%o1
	and	%o2,$mask,%o2
		faddd	$ahib,$nhib,$nhib
		fmuld	$nhi,$nd,$nhid
	sllx	%o1,16,%o1
		faddd	$dota,$nloa,$nloa
	sllx	%o2,32,%o2
		faddd	$dotb,$nlob,$nlob
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
		faddd	$ahic,$nhic,$dota	! $nhic
	or	%o2,%o0,%o0
		faddd	$ahid,$nhid,$dotb	! $nhid
	or	%o7,%o0,%o0		! 64-bit result
		faddd	$nloc,$nhia,$nloc
	addcc	%g1,%o0,%o0
	ldx	[$tp+8],%o7		! tp[j]
		faddd	$nlod,$nhib,$nlod
	srlx	%o3,16,%g1		! 34-bit carry
		fdtox	$nloa,$nloa
	bcs,a	%xcc,.+8
	add	%g1,1,%g1
		fdtox	$nlob,$nlob
	addcc	%o7,%o0,%o0
		fdtox	$nloc,$nloc
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[$tp]		! tp[j-1]
		fdtox	$nlod,$nlod

	std	$nloa,[%sp+$bias+$frame+0]
	std	$nlob,[%sp+$bias+$frame+8]
	std	$nloc,[%sp+$bias+$frame+16]
	addcc	$j,8,$j
	std	$nlod,[%sp+$bias+$frame+24]
	bnz,pt	%icc,.Linner
	add	$tp,8,$tp

.Linnerskip:
	fdtox	$dota,$dota
	fdtox	$dotb,$dotb

	ldx	[%sp+$bias+$frame+0],%o0
	ldx	[%sp+$bias+$frame+8],%o1
	ldx	[%sp+$bias+$frame+16],%o2
	ldx	[%sp+$bias+$frame+24],%o3

	srlx	%o0,16,%o7
	std	$dota,[%sp+$bias+$frame+32]
	add	%o7,%o1,%o1
	std	$dotb,[%sp+$bias+$frame+40]
	srlx	%o1,16,%o7
	add	%o7,%o2,%o2
	srlx	%o2,16,%o7
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,$mask,%o0
	and	%o1,$mask,%o1
	and	%o2,$mask,%o2
	sllx	%o1,16,%o1
	sllx	%o2,32,%o2
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	or	%o2,%o0,%o0
	ldx	[%sp+$bias+$frame+32],%o4
	or	%o7,%o0,%o0		! 64-bit result
	ldx	[%sp+$bias+$frame+40],%o5
	addcc	%g1,%o0,%o0
	ldx	[$tp+8],%o7		! tp[j]
	srlx	%o3,16,%g1		! 34-bit carry
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	addcc	%o7,%o0,%o0
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[$tp]		! tp[j-1]
	add	$tp,8,$tp

	srlx	%o4,16,%o7
	add	%o7,%o5,%o5
	and	%o4,$mask,%o4
	sllx	%o5,16,%o7
	or	%o7,%o4,%o4
	addcc	%g1,%o4,%o4
	srlx	%o5,48,%g1
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	addcc	$carry,%o4,%o4
	stx	%o4,[$tp]		! tp[num-1]
	mov	%g1,$carry
	bcs,a	%xcc,.+8
	add	$carry,1,$carry

	addcc	$i,8,$i
	bnz	%icc,.Louter
	nop

	add	$tp,8,$tp		! adjust tp to point at the end
	orn	%g0,%g0,%g4
	sub	%g0,$num,%o7		! n=-num
	ba	.Lsub
	subcc	%g0,%g0,%g0		! clear %icc.c

.align	32
.Lsub:
	ldx	[$tp+%o7],%o0
	add	$np,%o7,%g1
	ld	[%g1+0],%o2
	ld	[%g1+4],%o3
	srlx	%o0,32,%o1
	subccc	%o0,%o2,%o2
	add	$rp,%o7,%g1
	subccc	%o1,%o3,%o3
	st	%o2,[%g1+0]
	add	%o7,8,%o7
	brnz,pt	%o7,.Lsub
	st	%o3,[%g1+4]
	subc	$carry,0,%g4
	sub	%g0,$num,%o7		! n=-num
	ba	.Lcopy
	nop

.align	32
.Lcopy:
	ldx	[$tp+%o7],%o0
	add	$rp,%o7,%g1
	ld	[%g1+0],%o2
	ld	[%g1+4],%o3
	stx	%g0,[$tp+%o7]
	and	%o0,%g4,%o0
	srlx	%o0,32,%o1
	andn	%o2,%g4,%o2
	andn	%o3,%g4,%o3
	or	%o2,%o0,%o0
	or	%o3,%o1,%o1
	st	%o0,[%g1+0]
	add	%o7,8,%o7
	brnz,pt	%o7,.Lcopy
	st	%o1,[%g1+4]
	sub	%g0,$num,%o7		! n=-num

.Lzap:
	stx	%g0,[$ap_l+%o7]
	stx	%g0,[$ap_h+%o7]
	stx	%g0,[$np_l+%o7]
	stx	%g0,[$np_h+%o7]
	add	%o7,8,%o7
	brnz,pt	%o7,.Lzap
	nop

	ldx	[%sp+$bias+$frame+48],%o7
	wr	%g0,%o7,%asi		! restore %asi

	mov	1,%i0
.Lret:
	ret
	restore
.type   $fname,#function
.size	$fname,(.-$fname)
.asciz	"Montgomery Multipltication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>"
.align	32
___

# Expand any `...` Perl expressions embedded in the template. The
# template above contains none, so this pass currently changes nothing.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

# Below substitution makes it possible to compile without demanding
# VIS extensions on command line, e.g. -xarch=v9 vs. -xarch=v9a. I
# dare to do this, because VIS capability is detected at run-time now
# and this routine is not called on CPU not capable to execute it. Do
# note that fzeros is not the only VIS dependency! Another dependency
# is implicit and is just _a_ numerical value loaded to %asi register,
# which assembler can't recognize as VIS specific...
$code =~ s/fzeros\s+%f([0-9]+)/
	   sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1)
	  /gem;

print $code;
# flush: buffered write errors only surface when the handle is closed,
# so a failed close must abort instead of leaving a truncated output.
close STDOUT or die "error closing STDOUT: $!";