168651Skris; 268651Skris; PA-RISC 64-bit implementation of bn_asm code 368651Skris; 468651Skris; This code is approximately 2x faster than the C version 568651Skris; for RSA/DSA. 668651Skris; 768651Skris; See http://devresource.hp.com/ for more details on the PA-RISC 868651Skris; architecture. Also see the book "PA-RISC 2.0 Architecture" 968651Skris; by Gerry Kane for information on the instruction set architecture. 1068651Skris; 1168651Skris; Code written by Chris Ruemmler (with some help from the HP C 1268651Skris; compiler). 1368651Skris; 1468651Skris; The code compiles with HP's assembler 1568651Skris; 1668651Skris 1768651Skris .level 2.0W 1868651Skris .space $TEXT$ 1968651Skris .subspa $CODE$,QUAD=0,ALIGN=8,ACCESS=0x2c,CODE_ONLY 2068651Skris 2168651Skris; 2268651Skris; Global Register definitions used for the routines. 2368651Skris; 2468651Skris; Some information about HP's runtime architecture for 64-bits. 2568651Skris; 2668651Skris; "Caller save" means the calling function must save the register 2768651Skris; if it wants the register to be preserved. 2868651Skris; "Callee save" means if a function uses the register, it must save 2968651Skris; the value before using it. 
3068651Skris; 3168651Skris; For the floating point registers 3268651Skris; 3368651Skris; "caller save" registers: fr4-fr11, fr22-fr31 3468651Skris; "callee save" registers: fr12-fr21 3568651Skris; "special" registers: fr0-fr3 (status and exception registers) 3668651Skris; 3768651Skris; For the integer registers 3868651Skris; value zero : r0 3968651Skris; "caller save" registers: r1,r19-r26 4068651Skris; "callee save" registers: r3-r18 4168651Skris; return register : r2 (rp) 4268651Skris; return values ; r28 (ret0,ret1) 4368651Skris; Stack pointer ; r30 (sp) 4468651Skris; global data pointer ; r27 (dp) 4568651Skris; argument pointer ; r29 (ap) 4668651Skris; millicode return ptr ; r31 (also a caller save register) 4768651Skris 4868651Skris 4968651Skris; 5068651Skris; Arguments to the routines 5168651Skris; 5268651Skrisr_ptr .reg %r26 5368651Skrisa_ptr .reg %r25 5468651Skrisb_ptr .reg %r24 5568651Skrisnum .reg %r24 5668651Skrisw .reg %r23 5768651Skrisn .reg %r23 5868651Skris 5968651Skris 6068651Skris; 6168651Skris; Globals used in some routines 6268651Skris; 6368651Skris 6468651Skristop_overflow .reg %r29 6568651Skrishigh_mask .reg %r22 ; value 0xffffffff80000000L 6668651Skris 6768651Skris 6868651Skris;------------------------------------------------------------------------------ 6968651Skris; 7068651Skris; bn_mul_add_words 7168651Skris; 7268651Skris;BN_ULONG bn_mul_add_words(BN_ULONG *r_ptr, BN_ULONG *a_ptr, 7368651Skris; int num, BN_ULONG w) 7468651Skris; 7568651Skris; arg0 = r_ptr 7668651Skris; arg1 = a_ptr 7768651Skris; arg2 = num 7868651Skris; arg3 = w 7968651Skris; 8068651Skris; Local register definitions 8168651Skris; 8268651Skris 8368651Skrisfm1 .reg %fr22 8468651Skrisfm .reg %fr23 8568651Skrisht_temp .reg %fr24 8668651Skrisht_temp_1 .reg %fr25 8768651Skrislt_temp .reg %fr26 8868651Skrislt_temp_1 .reg %fr27 8968651Skrisfm1_1 .reg %fr28 9068651Skrisfm_1 .reg %fr29 9168651Skris 9268651Skrisfw_h .reg %fr7L 9368651Skrisfw_l .reg %fr7R 9468651Skrisfw .reg %fr7 
9568651Skris 9668651Skrisfht_0 .reg %fr8L 9768651Skrisflt_0 .reg %fr8R 9868651Skrist_float_0 .reg %fr8 9968651Skris 10068651Skrisfht_1 .reg %fr9L 10168651Skrisflt_1 .reg %fr9R 10268651Skrist_float_1 .reg %fr9 10368651Skris 10468651Skristmp_0 .reg %r31 10568651Skristmp_1 .reg %r21 10668651Skrism_0 .reg %r20 10768651Skrism_1 .reg %r19 10868651Skrisht_0 .reg %r1 10968651Skrisht_1 .reg %r3 11068651Skrislt_0 .reg %r4 11168651Skrislt_1 .reg %r5 11268651Skrism1_0 .reg %r6 11368651Skrism1_1 .reg %r7 11468651Skrisrp_val .reg %r8 11568651Skrisrp_val_1 .reg %r9 11668651Skris 11768651Skrisbn_mul_add_words 11868651Skris .export bn_mul_add_words,entry,NO_RELOCATION,LONG_RETURN 11968651Skris .proc 12068651Skris .callinfo frame=128 12168651Skris .entry 12268651Skris .align 64 12368651Skris 12468651Skris STD %r3,0(%sp) ; save r3 12568651Skris STD %r4,8(%sp) ; save r4 12668651Skris NOP ; Needed to make the loop 16-byte aligned 12768651Skris NOP ; Needed to make the loop 16-byte aligned 12868651Skris 12968651Skris STD %r5,16(%sp) ; save r5 13068651Skris STD %r6,24(%sp) ; save r6 13168651Skris STD %r7,32(%sp) ; save r7 13268651Skris STD %r8,40(%sp) ; save r8 13368651Skris 13468651Skris STD %r9,48(%sp) ; save r9 13568651Skris COPY %r0,%ret0 ; return 0 by default 13668651Skris DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32 13768651Skris STD w,56(%sp) ; store w on stack 13868651Skris 13968651Skris CMPIB,>= 0,num,bn_mul_add_words_exit ; if (num <= 0) then exit 14068651Skris LDO 128(%sp),%sp ; bump stack 14168651Skris 14268651Skris ; 14368651Skris ; The loop is unrolled twice, so if there is only 1 number 14468651Skris ; then go straight to the cleanup code. 
14568651Skris ; 14668651Skris CMPIB,= 1,num,bn_mul_add_words_single_top 14768651Skris FLDD -72(%sp),fw ; load up w into fp register fw (fw_h/fw_l) 14868651Skris 14968651Skris ; 15068651Skris ; This loop is unrolled 2 times (64-byte aligned as well) 15168651Skris ; 15268651Skris ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus 15368651Skris ; two 32-bit mutiplies can be issued per cycle. 15468651Skris ; 15568651Skrisbn_mul_add_words_unroll2 15668651Skris 15768651Skris FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) 15868651Skris FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R) 15968651Skris LDD 0(r_ptr),rp_val ; rp[0] 16068651Skris LDD 8(r_ptr),rp_val_1 ; rp[1] 16168651Skris 16268651Skris XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l 16368651Skris XMPYU fht_1,fw_l,fm1_1 ; m1[1] = fht_1*fw_l 16468651Skris FSTD fm1,-16(%sp) ; -16(sp) = m1[0] 16568651Skris FSTD fm1_1,-48(%sp) ; -48(sp) = m1[1] 16668651Skris 16768651Skris XMPYU flt_0,fw_h,fm ; m[0] = flt_0*fw_h 16868651Skris XMPYU flt_1,fw_h,fm_1 ; m[1] = flt_1*fw_h 16968651Skris FSTD fm,-8(%sp) ; -8(sp) = m[0] 17068651Skris FSTD fm_1,-40(%sp) ; -40(sp) = m[1] 17168651Skris 17268651Skris XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h 17368651Skris XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp_1 = fht_1*fw_h 17468651Skris FSTD ht_temp,-24(%sp) ; -24(sp) = ht_temp 17568651Skris FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht_temp_1 17668651Skris 17768651Skris XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l 17868651Skris XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp = lt*fw_l 17968651Skris FSTD lt_temp,-32(%sp) ; -32(sp) = lt_temp 18068651Skris FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt_temp_1 18168651Skris 18268651Skris LDD -8(%sp),m_0 ; m[0] 18368651Skris LDD -40(%sp),m_1 ; m[1] 18468651Skris LDD -16(%sp),m1_0 ; m1[0] 18568651Skris LDD -48(%sp),m1_1 ; m1[1] 18668651Skris 18768651Skris LDD -24(%sp),ht_0 ; ht[0] 18868651Skris LDD -56(%sp),ht_1 ; ht[1] 18968651Skris ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m[0] + m1[0]; 
19068651Skris ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m[1] + m1[1]; 19168651Skris 19268651Skris LDD -32(%sp),lt_0 19368651Skris LDD -64(%sp),lt_1 19468651Skris CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m[0] < m1[0]) 19568651Skris ADD,L ht_0,top_overflow,ht_0 ; ht[0] += (1<<32) 19668651Skris 19768651Skris CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m[1] < m1[1]) 19868651Skris ADD,L ht_1,top_overflow,ht_1 ; ht[1] += (1<<32) 19968651Skris EXTRD,U tmp_0,31,32,m_0 ; m[0]>>32 20068651Skris DEPD,Z tmp_0,31,32,m1_0 ; m1[0] = m[0]<<32 20168651Skris 20268651Skris EXTRD,U tmp_1,31,32,m_1 ; m[1]>>32 20368651Skris DEPD,Z tmp_1,31,32,m1_1 ; m1[1] = m[1]<<32 20468651Skris ADD,L ht_0,m_0,ht_0 ; ht[0]+= (m[0]>>32) 20568651Skris ADD,L ht_1,m_1,ht_1 ; ht[1]+= (m[1]>>32) 20668651Skris 20768651Skris ADD lt_0,m1_0,lt_0 ; lt[0] = lt[0]+m1[0]; 20868651Skris ADD,DC ht_0,%r0,ht_0 ; ht[0]++ 20968651Skris ADD lt_1,m1_1,lt_1 ; lt[1] = lt[1]+m1[1]; 21068651Skris ADD,DC ht_1,%r0,ht_1 ; ht[1]++ 21168651Skris 21268651Skris ADD %ret0,lt_0,lt_0 ; lt[0] = lt[0] + c; 21368651Skris ADD,DC ht_0,%r0,ht_0 ; ht[0]++ 21468651Skris ADD lt_0,rp_val,lt_0 ; lt[0] = lt[0]+rp[0] 21568651Skris ADD,DC ht_0,%r0,ht_0 ; ht[0]++ 21668651Skris 21768651Skris LDO -2(num),num ; num = num - 2; 21868651Skris ADD ht_0,lt_1,lt_1 ; lt[1] = lt[1] + ht_0 (c); 21968651Skris ADD,DC ht_1,%r0,ht_1 ; ht[1]++ 22068651Skris STD lt_0,0(r_ptr) ; rp[0] = lt[0] 22168651Skris 22268651Skris ADD lt_1,rp_val_1,lt_1 ; lt[1] = lt[1]+rp[1] 22368651Skris ADD,DC ht_1,%r0,%ret0 ; ht[1]++ 22468651Skris LDO 16(a_ptr),a_ptr ; a_ptr += 2 22568651Skris 22668651Skris STD lt_1,8(r_ptr) ; rp[1] = lt[1] 22768651Skris CMPIB,<= 2,num,bn_mul_add_words_unroll2 ; go again if more to do 22868651Skris LDO 16(r_ptr),r_ptr ; r_ptr += 2 22968651Skris 23068651Skris CMPIB,=,N 0,num,bn_mul_add_words_exit ; are we done, or cleanup last one 23168651Skris 23268651Skris ; 23368651Skris ; Top of loop aligned on 64-byte boundary 23468651Skris ; 23568651Skrisbn_mul_add_words_single_top 23668651Skris FLDD 
0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) 23768651Skris LDD 0(r_ptr),rp_val ; rp[0] 23868651Skris LDO 8(a_ptr),a_ptr ; a_ptr++ 23968651Skris XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l 24068651Skris FSTD fm1,-16(%sp) ; -16(sp) = m1 24168651Skris XMPYU flt_0,fw_h,fm ; m = lt*fw_h 24268651Skris FSTD fm,-8(%sp) ; -8(sp) = m 24368651Skris XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h 24468651Skris FSTD ht_temp,-24(%sp) ; -24(sp) = ht 24568651Skris XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l 24668651Skris FSTD lt_temp,-32(%sp) ; -32(sp) = lt 24768651Skris 24868651Skris LDD -8(%sp),m_0 24968651Skris LDD -16(%sp),m1_0 ; m1 = temp1 25068651Skris ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1; 25168651Skris LDD -24(%sp),ht_0 25268651Skris LDD -32(%sp),lt_0 25368651Skris 25468651Skris CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1) 25568651Skris ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32) 25668651Skris 25768651Skris EXTRD,U tmp_0,31,32,m_0 ; m>>32 25868651Skris DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32 25968651Skris 26068651Skris ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32) 26168651Skris ADD lt_0,m1_0,tmp_0 ; tmp_0 = lt+m1; 26268651Skris ADD,DC ht_0,%r0,ht_0 ; ht++ 26368651Skris ADD %ret0,tmp_0,lt_0 ; lt = lt + c; 26468651Skris ADD,DC ht_0,%r0,ht_0 ; ht++ 26568651Skris ADD lt_0,rp_val,lt_0 ; lt = lt+rp[0] 26668651Skris ADD,DC ht_0,%r0,%ret0 ; ht++ 26768651Skris STD lt_0,0(r_ptr) ; rp[0] = lt 26868651Skris 26968651Skrisbn_mul_add_words_exit 27068651Skris .EXIT 27168651Skris LDD -80(%sp),%r9 ; restore r9 27268651Skris LDD -88(%sp),%r8 ; restore r8 27368651Skris LDD -96(%sp),%r7 ; restore r7 27468651Skris LDD -104(%sp),%r6 ; restore r6 27568651Skris LDD -112(%sp),%r5 ; restore r5 27668651Skris LDD -120(%sp),%r4 ; restore r4 27768651Skris BVE (%rp) 27868651Skris LDD,MB -128(%sp),%r3 ; restore r3 27968651Skris .PROCEND ;in=23,24,25,26,29;out=28; 28068651Skris 28168651Skris;---------------------------------------------------------------------------- 28268651Skris; 28368651Skris;BN_ULONG 
bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) 28468651Skris; 28568651Skris; arg0 = rp 28668651Skris; arg1 = ap 28768651Skris; arg2 = num 28868651Skris; arg3 = w 28968651Skris 29068651Skrisbn_mul_words 29168651Skris .proc 29268651Skris .callinfo frame=128 29368651Skris .entry 29468651Skris .EXPORT bn_mul_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN 29568651Skris .align 64 29668651Skris 29768651Skris STD %r3,0(%sp) ; save r3 29868651Skris STD %r4,8(%sp) ; save r4 29968651Skris STD %r5,16(%sp) ; save r5 30068651Skris STD %r6,24(%sp) ; save r6 30168651Skris 30268651Skris STD %r7,32(%sp) ; save r7 30368651Skris COPY %r0,%ret0 ; return 0 by default 30468651Skris DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32 30568651Skris STD w,56(%sp) ; w on stack 30668651Skris 30768651Skris CMPIB,>= 0,num,bn_mul_words_exit 30868651Skris LDO 128(%sp),%sp ; bump stack 30968651Skris 31068651Skris ; 31168651Skris ; See if only 1 word to do, thus just do cleanup 31268651Skris ; 31368651Skris CMPIB,= 1,num,bn_mul_words_single_top 31468651Skris FLDD -72(%sp),fw ; load up w into fp register fw (fw_h/fw_l) 31568651Skris 31668651Skris ; 31768651Skris ; This loop is unrolled 2 times (64-byte aligned as well) 31868651Skris ; 31968651Skris ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus 32068651Skris ; two 32-bit mutiplies can be issued per cycle. 
32168651Skris ; 32268651Skrisbn_mul_words_unroll2 32368651Skris 32468651Skris FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) 32568651Skris FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R) 32668651Skris XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l 32768651Skris XMPYU fht_1,fw_l,fm1_1 ; m1[1] = ht*fw_l 32868651Skris 32968651Skris FSTD fm1,-16(%sp) ; -16(sp) = m1 33068651Skris FSTD fm1_1,-48(%sp) ; -48(sp) = m1 33168651Skris XMPYU flt_0,fw_h,fm ; m = lt*fw_h 33268651Skris XMPYU flt_1,fw_h,fm_1 ; m = lt*fw_h 33368651Skris 33468651Skris FSTD fm,-8(%sp) ; -8(sp) = m 33568651Skris FSTD fm_1,-40(%sp) ; -40(sp) = m 33668651Skris XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h 33768651Skris XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp = ht*fw_h 33868651Skris 33968651Skris FSTD ht_temp,-24(%sp) ; -24(sp) = ht 34068651Skris FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht 34168651Skris XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l 34268651Skris XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp = lt*fw_l 34368651Skris 34468651Skris FSTD lt_temp,-32(%sp) ; -32(sp) = lt 34568651Skris FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt 34668651Skris LDD -8(%sp),m_0 34768651Skris LDD -40(%sp),m_1 34868651Skris 34968651Skris LDD -16(%sp),m1_0 35068651Skris LDD -48(%sp),m1_1 35168651Skris LDD -24(%sp),ht_0 35268651Skris LDD -56(%sp),ht_1 35368651Skris 35468651Skris ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m + m1; 35568651Skris ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m + m1; 35668651Skris LDD -32(%sp),lt_0 35768651Skris LDD -64(%sp),lt_1 35868651Skris 35968651Skris CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m < m1) 36068651Skris ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32) 36168651Skris CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m < m1) 36268651Skris ADD,L ht_1,top_overflow,ht_1 ; ht += (1<<32) 36368651Skris 36468651Skris EXTRD,U tmp_0,31,32,m_0 ; m>>32 36568651Skris DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32 36668651Skris EXTRD,U tmp_1,31,32,m_1 ; m>>32 36768651Skris DEPD,Z tmp_1,31,32,m1_1 ; m1 = m<<32 36868651Skris 36968651Skris ADD,L 
ht_0,m_0,ht_0 ; ht+= (m>>32) 37068651Skris ADD,L ht_1,m_1,ht_1 ; ht+= (m>>32) 37168651Skris ADD lt_0,m1_0,lt_0 ; lt = lt+m1; 37268651Skris ADD,DC ht_0,%r0,ht_0 ; ht++ 37368651Skris 37468651Skris ADD lt_1,m1_1,lt_1 ; lt = lt+m1; 37568651Skris ADD,DC ht_1,%r0,ht_1 ; ht++ 37668651Skris ADD %ret0,lt_0,lt_0 ; lt = lt + c (ret0); 37768651Skris ADD,DC ht_0,%r0,ht_0 ; ht++ 37868651Skris 37968651Skris ADD ht_0,lt_1,lt_1 ; lt = lt + c (ht_0) 38068651Skris ADD,DC ht_1,%r0,ht_1 ; ht++ 38168651Skris STD lt_0,0(r_ptr) ; rp[0] = lt 38268651Skris STD lt_1,8(r_ptr) ; rp[1] = lt 38368651Skris 38468651Skris COPY ht_1,%ret0 ; carry = ht 38568651Skris LDO -2(num),num ; num = num - 2; 38668651Skris LDO 16(a_ptr),a_ptr ; ap += 2 38768651Skris CMPIB,<= 2,num,bn_mul_words_unroll2 38868651Skris LDO 16(r_ptr),r_ptr ; rp++ 38968651Skris 39068651Skris CMPIB,=,N 0,num,bn_mul_words_exit ; are we done? 39168651Skris 39268651Skris ; 39368651Skris ; Top of loop aligned on 64-byte boundary 39468651Skris ; 39568651Skrisbn_mul_words_single_top 39668651Skris FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) 39768651Skris 39868651Skris XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l 39968651Skris FSTD fm1,-16(%sp) ; -16(sp) = m1 40068651Skris XMPYU flt_0,fw_h,fm ; m = lt*fw_h 40168651Skris FSTD fm,-8(%sp) ; -8(sp) = m 40268651Skris XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h 40368651Skris FSTD ht_temp,-24(%sp) ; -24(sp) = ht 40468651Skris XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l 40568651Skris FSTD lt_temp,-32(%sp) ; -32(sp) = lt 40668651Skris 40768651Skris LDD -8(%sp),m_0 40868651Skris LDD -16(%sp),m1_0 40968651Skris ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1; 41068651Skris LDD -24(%sp),ht_0 41168651Skris LDD -32(%sp),lt_0 41268651Skris 41368651Skris CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1) 41468651Skris ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32) 41568651Skris 41668651Skris EXTRD,U tmp_0,31,32,m_0 ; m>>32 41768651Skris DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32 41868651Skris 41968651Skris ADD,L 
ht_0,m_0,ht_0 ; ht+= (m>>32) 42068651Skris ADD lt_0,m1_0,lt_0 ; lt= lt+m1; 42168651Skris ADD,DC ht_0,%r0,ht_0 ; ht++ 42268651Skris 42368651Skris ADD %ret0,lt_0,lt_0 ; lt = lt + c; 42468651Skris ADD,DC ht_0,%r0,ht_0 ; ht++ 42568651Skris 42668651Skris COPY ht_0,%ret0 ; copy carry 42768651Skris STD lt_0,0(r_ptr) ; rp[0] = lt 42868651Skris 42968651Skrisbn_mul_words_exit 43068651Skris .EXIT 43168651Skris LDD -96(%sp),%r7 ; restore r7 43268651Skris LDD -104(%sp),%r6 ; restore r6 43368651Skris LDD -112(%sp),%r5 ; restore r5 43468651Skris LDD -120(%sp),%r4 ; restore r4 43568651Skris BVE (%rp) 43668651Skris LDD,MB -128(%sp),%r3 ; restore r3 43768651Skris .PROCEND ;in=23,24,25,26,29;out=28; 43868651Skris 43968651Skris;---------------------------------------------------------------------------- 44068651Skris; 44168651Skris;void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num) 44268651Skris; 44368651Skris; arg0 = rp 44468651Skris; arg1 = ap 44568651Skris; arg2 = num 44668651Skris; 44768651Skris 44868651Skrisbn_sqr_words 44968651Skris .proc 45068651Skris .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE 45168651Skris .EXPORT bn_sqr_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN 45268651Skris .entry 45368651Skris .align 64 45468651Skris 45568651Skris STD %r3,0(%sp) ; save r3 45668651Skris STD %r4,8(%sp) ; save r4 45768651Skris NOP 45868651Skris STD %r5,16(%sp) ; save r5 45968651Skris 46068651Skris CMPIB,>= 0,num,bn_sqr_words_exit 46168651Skris LDO 128(%sp),%sp ; bump stack 46268651Skris 46368651Skris ; 46468651Skris ; If only 1, the goto straight to cleanup 46568651Skris ; 46668651Skris CMPIB,= 1,num,bn_sqr_words_single_top 46768651Skris DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L 46868651Skris 46968651Skris ; 47068651Skris ; This loop is unrolled 2 times (64-byte aligned as well) 47168651Skris ; 47268651Skris 47368651Skrisbn_sqr_words_unroll2 47468651Skris FLDD 0(a_ptr),t_float_0 ; a[0] 47568651Skris FLDD 8(a_ptr),t_float_1 ; a[1] 47668651Skris 
XMPYU fht_0,flt_0,fm ; m[0] 47768651Skris XMPYU fht_1,flt_1,fm_1 ; m[1] 47868651Skris 47968651Skris FSTD fm,-24(%sp) ; store m[0] 48068651Skris FSTD fm_1,-56(%sp) ; store m[1] 48168651Skris XMPYU flt_0,flt_0,lt_temp ; lt[0] 48268651Skris XMPYU flt_1,flt_1,lt_temp_1 ; lt[1] 48368651Skris 48468651Skris FSTD lt_temp,-16(%sp) ; store lt[0] 48568651Skris FSTD lt_temp_1,-48(%sp) ; store lt[1] 48668651Skris XMPYU fht_0,fht_0,ht_temp ; ht[0] 48768651Skris XMPYU fht_1,fht_1,ht_temp_1 ; ht[1] 48868651Skris 48968651Skris FSTD ht_temp,-8(%sp) ; store ht[0] 49068651Skris FSTD ht_temp_1,-40(%sp) ; store ht[1] 49168651Skris LDD -24(%sp),m_0 49268651Skris LDD -56(%sp),m_1 49368651Skris 49468651Skris AND m_0,high_mask,tmp_0 ; m[0] & Mask 49568651Skris AND m_1,high_mask,tmp_1 ; m[1] & Mask 49668651Skris DEPD,Z m_0,30,31,m_0 ; m[0] << 32+1 49768651Skris DEPD,Z m_1,30,31,m_1 ; m[1] << 32+1 49868651Skris 49968651Skris LDD -16(%sp),lt_0 50068651Skris LDD -48(%sp),lt_1 50168651Skris EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m[0]&Mask >> 32-1 50268651Skris EXTRD,U tmp_1,32,33,tmp_1 ; tmp_1 = m[1]&Mask >> 32-1 50368651Skris 50468651Skris LDD -8(%sp),ht_0 50568651Skris LDD -40(%sp),ht_1 50668651Skris ADD,L ht_0,tmp_0,ht_0 ; ht[0] += tmp_0 50768651Skris ADD,L ht_1,tmp_1,ht_1 ; ht[1] += tmp_1 50868651Skris 50968651Skris ADD lt_0,m_0,lt_0 ; lt = lt+m 51068651Skris ADD,DC ht_0,%r0,ht_0 ; ht[0]++ 51168651Skris STD lt_0,0(r_ptr) ; rp[0] = lt[0] 51268651Skris STD ht_0,8(r_ptr) ; rp[1] = ht[1] 51368651Skris 51468651Skris ADD lt_1,m_1,lt_1 ; lt = lt+m 51568651Skris ADD,DC ht_1,%r0,ht_1 ; ht[1]++ 51668651Skris STD lt_1,16(r_ptr) ; rp[2] = lt[1] 51768651Skris STD ht_1,24(r_ptr) ; rp[3] = ht[1] 51868651Skris 51968651Skris LDO -2(num),num ; num = num - 2; 52068651Skris LDO 16(a_ptr),a_ptr ; ap += 2 52168651Skris CMPIB,<= 2,num,bn_sqr_words_unroll2 52268651Skris LDO 32(r_ptr),r_ptr ; rp += 4 52368651Skris 52468651Skris CMPIB,=,N 0,num,bn_sqr_words_exit ; are we done? 
52568651Skris 52668651Skris ; 52768651Skris ; Top of loop aligned on 64-byte boundary 52868651Skris ; 52968651Skrisbn_sqr_words_single_top 53068651Skris FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) 53168651Skris 53268651Skris XMPYU fht_0,flt_0,fm ; m 53368651Skris FSTD fm,-24(%sp) ; store m 53468651Skris 53568651Skris XMPYU flt_0,flt_0,lt_temp ; lt 53668651Skris FSTD lt_temp,-16(%sp) ; store lt 53768651Skris 53868651Skris XMPYU fht_0,fht_0,ht_temp ; ht 53968651Skris FSTD ht_temp,-8(%sp) ; store ht 54068651Skris 54168651Skris LDD -24(%sp),m_0 ; load m 54268651Skris AND m_0,high_mask,tmp_0 ; m & Mask 54368651Skris DEPD,Z m_0,30,31,m_0 ; m << 32+1 54468651Skris LDD -16(%sp),lt_0 ; lt 54568651Skris 54668651Skris LDD -8(%sp),ht_0 ; ht 54768651Skris EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m&Mask >> 32-1 54868651Skris ADD m_0,lt_0,lt_0 ; lt = lt+m 54968651Skris ADD,L ht_0,tmp_0,ht_0 ; ht += tmp_0 55068651Skris ADD,DC ht_0,%r0,ht_0 ; ht++ 55168651Skris 55268651Skris STD lt_0,0(r_ptr) ; rp[0] = lt 55368651Skris STD ht_0,8(r_ptr) ; rp[1] = ht 55468651Skris 55568651Skrisbn_sqr_words_exit 55668651Skris .EXIT 55768651Skris LDD -112(%sp),%r5 ; restore r5 55868651Skris LDD -120(%sp),%r4 ; restore r4 55968651Skris BVE (%rp) 56068651Skris LDD,MB -128(%sp),%r3 56168651Skris .PROCEND ;in=23,24,25,26,29;out=28; 56268651Skris 56368651Skris 56468651Skris;---------------------------------------------------------------------------- 56568651Skris; 56668651Skris;BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) 56768651Skris; 56868651Skris; arg0 = rp 56968651Skris; arg1 = ap 57068651Skris; arg2 = bp 57168651Skris; arg3 = n 57268651Skris 57368651Skrist .reg %r22 57468651Skrisb .reg %r21 57568651Skrisl .reg %r20 57668651Skris 57768651Skrisbn_add_words 57868651Skris .proc 57968651Skris .entry 58068651Skris .callinfo 58168651Skris .EXPORT bn_add_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN 58268651Skris .align 64 58368651Skris 58468651Skris CMPIB,>= 
0,n,bn_add_words_exit 58568651Skris COPY %r0,%ret0 ; return 0 by default 58668651Skris 58768651Skris ; 58868651Skris ; If 2 or more numbers do the loop 58968651Skris ; 59068651Skris CMPIB,= 1,n,bn_add_words_single_top 59168651Skris NOP 59268651Skris 59368651Skris ; 59468651Skris ; This loop is unrolled 2 times (64-byte aligned as well) 59568651Skris ; 59668651Skrisbn_add_words_unroll2 59768651Skris LDD 0(a_ptr),t 59868651Skris LDD 0(b_ptr),b 59968651Skris ADD t,%ret0,t ; t = t+c; 60068651Skris ADD,DC %r0,%r0,%ret0 ; set c to carry 60168651Skris ADD t,b,l ; l = t + b[0] 60268651Skris ADD,DC %ret0,%r0,%ret0 ; c+= carry 60368651Skris STD l,0(r_ptr) 60468651Skris 60568651Skris LDD 8(a_ptr),t 60668651Skris LDD 8(b_ptr),b 60768651Skris ADD t,%ret0,t ; t = t+c; 60868651Skris ADD,DC %r0,%r0,%ret0 ; set c to carry 60968651Skris ADD t,b,l ; l = t + b[0] 61068651Skris ADD,DC %ret0,%r0,%ret0 ; c+= carry 61168651Skris STD l,8(r_ptr) 61268651Skris 61368651Skris LDO -2(n),n 61468651Skris LDO 16(a_ptr),a_ptr 61568651Skris LDO 16(b_ptr),b_ptr 61668651Skris 61768651Skris CMPIB,<= 2,n,bn_add_words_unroll2 61868651Skris LDO 16(r_ptr),r_ptr 61968651Skris 62068651Skris CMPIB,=,N 0,n,bn_add_words_exit ; are we done? 62168651Skris 62268651Skrisbn_add_words_single_top 62368651Skris LDD 0(a_ptr),t 62468651Skris LDD 0(b_ptr),b 62568651Skris 62668651Skris ADD t,%ret0,t ; t = t+c; 62768651Skris ADD,DC %r0,%r0,%ret0 ; set c to carry (could use CMPCLR??) 
62868651Skris ADD t,b,l ; l = t + b[0] 62968651Skris ADD,DC %ret0,%r0,%ret0 ; c+= carry 63068651Skris STD l,0(r_ptr) 63168651Skris 63268651Skrisbn_add_words_exit 63368651Skris .EXIT 63468651Skris BVE (%rp) 63568651Skris NOP 63668651Skris .PROCEND ;in=23,24,25,26,29;out=28; 63768651Skris 63868651Skris;---------------------------------------------------------------------------- 63968651Skris; 64068651Skris;BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) 64168651Skris; 64268651Skris; arg0 = rp 64368651Skris; arg1 = ap 64468651Skris; arg2 = bp 64568651Skris; arg3 = n 64668651Skris 64768651Skrist1 .reg %r22 64868651Skrist2 .reg %r21 64968651Skrissub_tmp1 .reg %r20 65068651Skrissub_tmp2 .reg %r19 65168651Skris 65268651Skris 65368651Skrisbn_sub_words 65468651Skris .proc 65568651Skris .callinfo 65668651Skris .EXPORT bn_sub_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN 65768651Skris .entry 65868651Skris .align 64 65968651Skris 66068651Skris CMPIB,>= 0,n,bn_sub_words_exit 66168651Skris COPY %r0,%ret0 ; return 0 by default 66268651Skris 66368651Skris ; 66468651Skris ; If 2 or more numbers do the loop 66568651Skris ; 66668651Skris CMPIB,= 1,n,bn_sub_words_single_top 66768651Skris NOP 66868651Skris 66968651Skris ; 67068651Skris ; This loop is unrolled 2 times (64-byte aligned as well) 67168651Skris ; 67268651Skrisbn_sub_words_unroll2 67368651Skris LDD 0(a_ptr),t1 67468651Skris LDD 0(b_ptr),t2 67568651Skris SUB t1,t2,sub_tmp1 ; t3 = t1-t2; 67668651Skris SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c; 67768651Skris 67868651Skris CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2 67968651Skris LDO 1(%r0),sub_tmp2 68068651Skris 68168651Skris CMPCLR,*= t1,t2,%r0 68268651Skris COPY sub_tmp2,%ret0 68368651Skris STD sub_tmp1,0(r_ptr) 68468651Skris 68568651Skris LDD 8(a_ptr),t1 68668651Skris LDD 8(b_ptr),t2 68768651Skris SUB t1,t2,sub_tmp1 ; t3 = t1-t2; 68868651Skris SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c; 68968651Skris CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2 
69068651Skris LDO 1(%r0),sub_tmp2 69168651Skris 69268651Skris CMPCLR,*= t1,t2,%r0 69368651Skris COPY sub_tmp2,%ret0 69468651Skris STD sub_tmp1,8(r_ptr) 69568651Skris 69668651Skris LDO -2(n),n 69768651Skris LDO 16(a_ptr),a_ptr 69868651Skris LDO 16(b_ptr),b_ptr 69968651Skris 70068651Skris CMPIB,<= 2,n,bn_sub_words_unroll2 70168651Skris LDO 16(r_ptr),r_ptr 70268651Skris 70368651Skris CMPIB,=,N 0,n,bn_sub_words_exit ; are we done? 70468651Skris 70568651Skrisbn_sub_words_single_top 70668651Skris LDD 0(a_ptr),t1 70768651Skris LDD 0(b_ptr),t2 70868651Skris SUB t1,t2,sub_tmp1 ; t3 = t1-t2; 70968651Skris SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c; 71068651Skris CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2 71168651Skris LDO 1(%r0),sub_tmp2 71268651Skris 71368651Skris CMPCLR,*= t1,t2,%r0 71468651Skris COPY sub_tmp2,%ret0 71568651Skris 71668651Skris STD sub_tmp1,0(r_ptr) 71768651Skris 71868651Skrisbn_sub_words_exit 71968651Skris .EXIT 72068651Skris BVE (%rp) 72168651Skris NOP 72268651Skris .PROCEND ;in=23,24,25,26,29;out=28; 72368651Skris 72468651Skris;------------------------------------------------------------------------------ 72568651Skris; 72668651Skris; unsigned long bn_div_words(unsigned long h, unsigned long l, unsigned long d) 72768651Skris; 72868651Skris; arg0 = h 72968651Skris; arg1 = l 73068651Skris; arg2 = d 73168651Skris; 73268651Skris; This is mainly just modified assembly from the compiler, thus the 73368651Skris; lack of variable names. 
73468651Skris; 73568651Skris;------------------------------------------------------------------------------ 73668651Skrisbn_div_words 73768651Skris .proc 73868651Skris .callinfo CALLER,FRAME=272,ENTRY_GR=%r10,SAVE_RP,ARGS_SAVED,ORDERING_AWARE 73968651Skris .EXPORT bn_div_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN 74068651Skris .IMPORT BN_num_bits_word,CODE,NO_RELOCATION 74168651Skris .IMPORT __iob,DATA 74268651Skris .IMPORT fprintf,CODE,NO_RELOCATION 74368651Skris .IMPORT abort,CODE,NO_RELOCATION 74468651Skris .IMPORT $$div2U,MILLICODE 74568651Skris .entry 74668651Skris STD %r2,-16(%r30) 74768651Skris STD,MA %r3,352(%r30) 74868651Skris STD %r4,-344(%r30) 74968651Skris STD %r5,-336(%r30) 75068651Skris STD %r6,-328(%r30) 75168651Skris STD %r7,-320(%r30) 75268651Skris STD %r8,-312(%r30) 75368651Skris STD %r9,-304(%r30) 75468651Skris STD %r10,-296(%r30) 75568651Skris 75668651Skris STD %r27,-288(%r30) ; save gp 75768651Skris 75868651Skris COPY %r24,%r3 ; save d 75968651Skris COPY %r26,%r4 ; save h (high 64-bits) 76068651Skris LDO -1(%r0),%ret0 ; return -1 by default 76168651Skris 76268651Skris CMPB,*= %r0,%arg2,$D3 ; if (d == 0) 76368651Skris COPY %r25,%r5 ; save l (low 64-bits) 76468651Skris 76568651Skris LDO -48(%r30),%r29 ; create ap 76668651Skris .CALL ;in=26,29;out=28; 76768651Skris B,L BN_num_bits_word,%r2 76868651Skris COPY %r3,%r26 76968651Skris LDD -288(%r30),%r27 ; restore gp 77068651Skris LDI 64,%r21 77168651Skris 77268651Skris CMPB,= %r21,%ret0,$00000012 ;if (i == 64) (forward) 77368651Skris COPY %ret0,%r24 ; i 77468651Skris MTSARCM %r24 77568651Skris DEPDI,Z -1,%sar,1,%r29 77668651Skris CMPB,*<<,N %r29,%r4,bn_div_err_case ; if (h > 1<<i) (forward) 77768651Skris 77868651Skris$00000012 77968651Skris SUBI 64,%r24,%r31 ; i = 64 - i; 78068651Skris CMPCLR,*<< %r4,%r3,%r0 ; if (h >= d) 78168651Skris SUB %r4,%r3,%r4 ; h -= d 78268651Skris CMPB,= %r31,%r0,$0000001A ; if (i) 78368651Skris COPY %r0,%r10 ; ret = 0 78468651Skris MTSARCM %r31 ; i to shift 
78568651Skris DEPD,Z %r3,%sar,64,%r3 ; d <<= i; 78668651Skris SUBI 64,%r31,%r19 ; 64 - i; redundent 78768651Skris MTSAR %r19 ; (64 -i) to shift 78868651Skris SHRPD %r4,%r5,%sar,%r4 ; l>> (64-i) 78968651Skris MTSARCM %r31 ; i to shift 79068651Skris DEPD,Z %r5,%sar,64,%r5 ; l <<= i; 79168651Skris 79268651Skris$0000001A 79368651Skris DEPDI,Z -1,31,32,%r19 79468651Skris EXTRD,U %r3,31,32,%r6 ; dh=(d&0xfff)>>32 79568651Skris EXTRD,U %r3,63,32,%r8 ; dl = d&0xffffff 79668651Skris LDO 2(%r0),%r9 79768651Skris STD %r3,-280(%r30) ; "d" to stack 79868651Skris 79968651Skris$0000001C 80068651Skris DEPDI,Z -1,63,32,%r29 ; 80168651Skris EXTRD,U %r4,31,32,%r31 ; h >> 32 80268651Skris CMPB,*=,N %r31,%r6,$D2 ; if ((h>>32) != dh)(forward) div 80368651Skris COPY %r4,%r26 80468651Skris EXTRD,U %r4,31,32,%r25 80568651Skris COPY %r6,%r24 80668651Skris .CALL ;in=23,24,25,26;out=20,21,22,28,29; (MILLICALL) 80768651Skris B,L $$div2U,%r2 80868651Skris EXTRD,U %r6,31,32,%r23 80968651Skris DEPD %r28,31,32,%r29 81068651Skris$D2 81168651Skris STD %r29,-272(%r30) ; q 81268651Skris AND %r5,%r19,%r24 ; t & 0xffffffff00000000; 81368651Skris EXTRD,U %r24,31,32,%r24 ; ??? 
81468651Skris FLDD -272(%r30),%fr7 ; q 81568651Skris FLDD -280(%r30),%fr8 ; d 81668651Skris XMPYU %fr8L,%fr7L,%fr10 81768651Skris FSTD %fr10,-256(%r30) 81868651Skris XMPYU %fr8L,%fr7R,%fr22 81968651Skris FSTD %fr22,-264(%r30) 82068651Skris XMPYU %fr8R,%fr7L,%fr11 82168651Skris XMPYU %fr8R,%fr7R,%fr23 82268651Skris FSTD %fr11,-232(%r30) 82368651Skris FSTD %fr23,-240(%r30) 82468651Skris LDD -256(%r30),%r28 82568651Skris DEPD,Z %r28,31,32,%r2 82668651Skris LDD -264(%r30),%r20 82768651Skris ADD,L %r20,%r2,%r31 82868651Skris LDD -232(%r30),%r22 82968651Skris DEPD,Z %r22,31,32,%r22 83068651Skris LDD -240(%r30),%r21 83168651Skris B $00000024 ; enter loop 83268651Skris ADD,L %r21,%r22,%r23 83368651Skris 83468651Skris$0000002A 83568651Skris LDO -1(%r29),%r29 83668651Skris SUB %r23,%r8,%r23 83768651Skris$00000024 83868651Skris SUB %r4,%r31,%r25 83968651Skris AND %r25,%r19,%r26 84068651Skris CMPB,*<>,N %r0,%r26,$00000046 ; (forward) 84168651Skris DEPD,Z %r25,31,32,%r20 84268651Skris OR %r20,%r24,%r21 84368651Skris CMPB,*<<,N %r21,%r23,$0000002A ;(backward) 84468651Skris SUB %r31,%r6,%r31 84568651Skris;-------------Break path--------------------- 84668651Skris 84768651Skris$00000046 84868651Skris DEPD,Z %r23,31,32,%r25 ;tl 84968651Skris EXTRD,U %r23,31,32,%r26 ;t 85068651Skris AND %r25,%r19,%r24 ;tl = (tl<<32)&0xfffffff0000000L 85168651Skris ADD,L %r31,%r26,%r31 ;th += t; 85268651Skris CMPCLR,*>>= %r5,%r24,%r0 ;if (l<tl) 85368651Skris LDO 1(%r31),%r31 ; th++; 85468651Skris CMPB,*<<=,N %r31,%r4,$00000036 ;if (n < th) (forward) 85568651Skris LDO -1(%r29),%r29 ;q--; 85668651Skris ADD,L %r4,%r3,%r4 ;h += d; 85768651Skris$00000036 85868651Skris ADDIB,=,N -1,%r9,$D1 ;if (--count == 0) break (forward) 85968651Skris SUB %r5,%r24,%r28 ; l -= tl; 86068651Skris SUB %r4,%r31,%r24 ; h -= th; 86168651Skris SHRPD %r24,%r28,32,%r4 ; h = ((h<<32)|(l>>32)); 86268651Skris DEPD,Z %r29,31,32,%r10 ; ret = q<<32 86368651Skris b $0000001C 86468651Skris DEPD,Z %r28,31,32,%r5 ; l = l << 32 
86568651Skris 86668651Skris$D1 86768651Skris OR %r10,%r29,%r28 ; ret |= q 86868651Skris$D3 86968651Skris LDD -368(%r30),%r2 87068651Skris$D0 87168651Skris LDD -296(%r30),%r10 87268651Skris LDD -304(%r30),%r9 87368651Skris LDD -312(%r30),%r8 87468651Skris LDD -320(%r30),%r7 87568651Skris LDD -328(%r30),%r6 87668651Skris LDD -336(%r30),%r5 87768651Skris LDD -344(%r30),%r4 87868651Skris BVE (%r2) 87968651Skris .EXIT 88068651Skris LDD,MB -352(%r30),%r3 88168651Skris 88268651Skrisbn_div_err_case 88368651Skris MFIA %r6 88468651Skris ADDIL L'bn_div_words-bn_div_err_case,%r6,%r1 88568651Skris LDO R'bn_div_words-bn_div_err_case(%r1),%r6 88668651Skris ADDIL LT'__iob,%r27,%r1 88768651Skris LDD RT'__iob(%r1),%r26 88868651Skris ADDIL L'C$4-bn_div_words,%r6,%r1 88968651Skris LDO R'C$4-bn_div_words(%r1),%r25 89068651Skris LDO 64(%r26),%r26 89168651Skris .CALL ;in=24,25,26,29;out=28; 89268651Skris B,L fprintf,%r2 89368651Skris LDO -48(%r30),%r29 89468651Skris LDD -288(%r30),%r27 89568651Skris .CALL ;in=29; 89668651Skris B,L abort,%r2 89768651Skris LDO -48(%r30),%r29 89868651Skris LDD -288(%r30),%r27 89968651Skris B $D0 90068651Skris LDD -368(%r30),%r2 90168651Skris .PROCEND ;in=24,25,26,29;out=28; 90268651Skris 90368651Skris;---------------------------------------------------------------------------- 90468651Skris; 90568651Skris; Registers to hold 64-bit values to manipulate. The "L" part 90668651Skris; of the register corresponds to the upper 32-bits, while the "R" 90768651Skris; part corresponds to the lower 32-bits 90868651Skris; 90968651Skris; Note, that when using b6 and b7, the code must save these before 91068651Skris; using them because they are callee save registers 91168651Skris; 91268651Skris; 91368651Skris; Floating point registers to use to save values that 91468651Skris; are manipulated. 
; The aliases in this group do not overlap the ftemp scratch registers
; declared a little further down, and every one of them names a
; caller-save floating point register (%fr22-%fr31), so the routines
; may overwrite them without saving the previous contents.
;
; Each 64-bit operand gets three names: the whole register, its "L"
; half (upper 32 bits) and its "R" half (lower 32 bits).
;
a0              .reg    %fr22
a0L             .reg    %fr22L
a0R             .reg    %fr22R

a1              .reg    %fr23
a1L             .reg    %fr23L
a1R             .reg    %fr23R

a2              .reg    %fr24
a2L             .reg    %fr24L
a2R             .reg    %fr24R

a3              .reg    %fr25
a3L             .reg    %fr25L
a3R             .reg    %fr25R

a4              .reg    %fr26
a4L             .reg    %fr26L
a4R             .reg    %fr26R

a5              .reg    %fr27
a5L             .reg    %fr27L
a5R             .reg    %fr27R

a6              .reg    %fr28
a6L             .reg    %fr28L
a6R             .reg    %fr28R

a7              .reg    %fr29
a7L             .reg    %fr29L
a7R             .reg    %fr29R

b0              .reg    %fr30
b0L             .reg    %fr30L
b0R             .reg    %fr30R

b1              .reg    %fr31
b1L             .reg    %fr31L
b1R             .reg    %fr31R

;
; Scratch floating point registers that receive the XMPYU partial
; products.  %fr4-%fr7 are caller-save as well.  (Only ftemp1-ftemp4
; are defined here.)
;
ftemp1          .reg    %fr4
ftemp2          .reg    %fr5
ftemp3          .reg    %fr6
ftemp4          .reg    %fr7

;
; The B set of registers when used.
;
; b2-b7: the remaining B operand aliases.  b2-b5 live in caller-save
; registers (%fr8-%fr11).  b6 and b7 live in %fr12/%fr13, which are
; callee-save, so a routine that loads them must preserve the old
; contents first.
;
b2              .reg    %fr8
b2L             .reg    %fr8L
b2R             .reg    %fr8R

b3              .reg    %fr9
b3L             .reg    %fr9L
b3R             .reg    %fr9R

b4              .reg    %fr10
b4L             .reg    %fr10L
b4R             .reg    %fr10R

b5              .reg    %fr11
b5L             .reg    %fr11L
b5R             .reg    %fr11R

b6              .reg    %fr12
b6L             .reg    %fr12L
b6R             .reg    %fr12R

b7              .reg    %fr13
b7L             .reg    %fr13L
b7R             .reg    %fr13R

;
; Integer register aliases.
;
; Three-word column accumulator (c3 is %r3, a callee-save register).
;
c1              .reg    %r21
c2              .reg    %r23
c3              .reg    %r3

;
; Integer scratch (all caller-save).
;
temp1           .reg    %r20
temp2           .reg    %r19
temp3           .reg    %r31

;
; Partial products after they have been moved from the FP side through
; the stack.  m, lt and ht are %r4-%r6, which are callee-save.
;
m               .reg    %r4
m1              .reg    %r28
lt              .reg    %r5
ht              .reg    %r6

;
; Constant: the value 1 shifted left by 32, set up by each routine.
;
high_one        .reg    %r1

;
; SQR_ADD_C  A0L,A0R,C1,C2,C3
;
; Adds the 128-bit square of the 64-bit value whose halves are A0L (upper
; 32 bits) and A0R (lower 32 bits) into the three-word accumulator
; C1 (lowest), C2, C3.
;
;   m  = A0L * A0R        (cross product; it is needed twice)
;   lt = A0R * A0R
;   ht = A0L * A0L
;   square = ht * 2^64 + m * 2^33 + lt
;
; Needs high_mask to hold 0xffffffff80000000.  Uses ftemp1-ftemp3,
; temp1-temp3, m, lt, ht, and the three doublewords at -24(%sp), -16(%sp)
; and -8(%sp) as a transfer area between the FP and integer registers.
;
; The instruction order matters: ADD sets the carry bit, ADD,L leaves it
; alone, and ADD,DC consumes it.
;
SQR_ADD_C       .macro  A0L,A0R,C1,C2,C3
        XMPYU   A0L,A0R,ftemp1          ; m  = hi * lo
        FSTD    ftemp1,-24(%sp)         ; spill m

        XMPYU   A0R,A0R,ftemp2          ; lt = lo * lo
        FSTD    ftemp2,-16(%sp)         ; spill lt

        XMPYU   A0L,A0L,ftemp3          ; ht = hi * hi
        FSTD    ftemp3,-8(%sp)          ; spill ht

        LDD     -24(%sp),m              ; m into an integer register
        AND     m,high_mask,temp2       ; keep bits of m from bit 31 upward
        DEPD,Z  m,30,31,temp3           ; temp3 = low 31 bits of m, shifted left 33
        LDD     -16(%sp),lt

        LDD     -8(%sp),ht
        EXTRD,U temp2,32,33,temp1       ; temp1 = m shifted right by 31
        ADD     temp3,lt,lt             ; lt += low part of 2*m*2^32 (sets carry)
        ADD,L   ht,temp1,ht             ; ht += high part (carry untouched)
        ADD,DC  ht,%r0,ht               ; ht += carry out of the lt addition

        ADD     C1,lt,C1                ; C1 += lt
        ADD,DC  ht,%r0,ht               ; ht += carry out of C1

        ADD     C2,ht,C2                ; C2 += ht
        ADD,DC  C3,%r0,C3               ; C3 += carry out of C2
.endm

;
; SQR_ADD_C2  A0L,A0R,A1L,A1R,C1,C2,C3
;
; Adds twice the 128-bit product of two 64-bit values (halves A0L:A0R and
; A1L:A1R) into the three-word accumulator C1 (lowest), C2, C3.  This is
; the off-diagonal term of a square, which appears twice.
;
; Needs high_one to hold the value 1 shifted left by 32.  Uses
; ftemp1-ftemp4, temp1, temp3, m, m1, lt, ht, and the four doublewords at
; -32(%sp) .. -8(%sp) as a transfer area between FP and integer registers.
;
; The instruction order matters: ADD sets the carry bit, ADD,L leaves it
; alone, and ADD,DC consumes it.
;
SQR_ADD_C2      .macro  A0L,A0R,A1L,A1R,C1,C2,C3
        XMPYU   A0L,A1R,ftemp1          ; m1 = hi0 * lo1
        FSTD    ftemp1,-16(%sp)
        XMPYU   A0R,A1L,ftemp2          ; m  = lo0 * hi1
        FSTD    ftemp2,-8(%sp)
        XMPYU   A0R,A1R,ftemp3          ; lt = lo0 * lo1
        FSTD    ftemp3,-32(%sp)
        XMPYU   A0L,A1L,ftemp4          ; ht = hi0 * hi1
        FSTD    ftemp4,-24(%sp)

        LDD     -8(%sp),m               ; m
        LDD     -16(%sp),m1             ; m1
        ADD,L   m,m1,m                  ; m += m1 (carry bit untouched)

        DEPD,Z  m,31,32,temp3           ; temp3 = m shifted left by 32
        LDD     -24(%sp),ht             ; ht

        CMPCLR,*>>= m,m1,%r0            ; next insn runs only if m is below m1 (sum wrapped)
        ADD,L   ht,high_one,ht          ; ht += 2^32

        EXTRD,U m,31,32,temp1           ; temp1 = m shifted right by 32
        LDD     -32(%sp),lt             ; lt
        ADD,L   ht,temp1,ht             ; ht += upper half of m
        ADD     lt,temp3,lt             ; lt += lower half of m, shifted up (sets carry)
        ADD,DC  ht,%r0,ht               ; ht += carry

        ADD     ht,ht,ht                ; ht = 2 * ht
        ADD,DC  C3,%r0,C3               ; carry out of the doubling goes to C3

        ADD     lt,lt,lt                ; lt = 2 * lt
        ADD,DC  ht,%r0,ht               ; carry out of the doubling goes to ht

        ADD     C1,lt,C1                ; C1 += lt
        ADD,DC,*NUV ht,%r0,ht           ; ht += carry; next insn is nullified unless this overflowed
        LDO     1(C3),C3                ; C3++ on overflow

        ADD     C2,ht,C2                ; C2 += ht
        ADD,DC  C3,%r0,C3               ; C3 += carry out of C2
.endm

;
;void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
;   arg0 = r_ptr
;   arg1 = a_ptr
;
; Squares the 8-word value at a_ptr and stores the 16-word result at
; r_ptr.  Result words are produced column by column; c1/c2/c3 rotate
; through the roles of low, middle and high accumulator word.
;
; Frame (128 bytes): r3-r6 are saved at the bottom.  After the stack bump
; the macros use -32(%sp) .. -8(%sp) as their transfer area.
;

bn_sqr_comba8
        .PROC
        .CALLINFO FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
        .EXPORT bn_sqr_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
        .ENTRY
        .align 64

        STD     %r3,0(%sp)              ; save r3
        STD     %r4,8(%sp)              ; save r4
        STD     %r5,16(%sp)             ; save r5
        STD     %r6,24(%sp)             ; save r6

        ;
        ; Zero out carries
        ;
        COPY    %r0,c1
        COPY    %r0,c2
        COPY    %r0,c3

        LDO     128(%sp),%sp            ; bump stack
        DEPDI,Z -1,32,33,high_mask      ; mask 0xffffffff80000000 for SQR_ADD_C
        DEPDI,Z 1,31,1,high_one         ; the value 2^32

        ;
        ; Load up all of the values we are going to use
        ;
        FLDD    0(a_ptr),a0
        FLDD    8(a_ptr),a1
        FLDD    16(a_ptr),a2
        FLDD    24(a_ptr),a3
        FLDD    32(a_ptr),a4
        FLDD    40(a_ptr),a5
        FLDD    48(a_ptr),a6
        FLDD    56(a_ptr),a7

        SQR_ADD_C  a0L,a0R,c1,c2,c3
        STD     c1,0(r_ptr)             ; r[0] = c1;
        COPY    %r0,c1

        SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
        STD     c2,8(r_ptr)             ; r[1] = c2;
        COPY    %r0,c2

        SQR_ADD_C  a1L,a1R,c3,c1,c2
        SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
        STD     c3,16(r_ptr)            ; r[2] = c3;
        COPY    %r0,c3

        SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
        SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
        STD     c1,24(r_ptr)            ; r[3] = c1;
        COPY    %r0,c1

        SQR_ADD_C  a2L,a2R,c2,c3,c1
        SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
        SQR_ADD_C2 a4L,a4R,a0L,a0R,c2,c3,c1
        STD     c2,32(r_ptr)            ; r[4] = c2;
        COPY    %r0,c2

        SQR_ADD_C2 a5L,a5R,a0L,a0R,c3,c1,c2
        SQR_ADD_C2 a4L,a4R,a1L,a1R,c3,c1,c2
        SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
        STD     c3,40(r_ptr)            ; r[5] = c3;
        COPY    %r0,c3

        SQR_ADD_C  a3L,a3R,c1,c2,c3
        SQR_ADD_C2 a4L,a4R,a2L,a2R,c1,c2,c3
        SQR_ADD_C2 a5L,a5R,a1L,a1R,c1,c2,c3
        SQR_ADD_C2 a6L,a6R,a0L,a0R,c1,c2,c3
        STD     c1,48(r_ptr)            ; r[6] = c1;
        COPY    %r0,c1

        SQR_ADD_C2 a7L,a7R,a0L,a0R,c2,c3,c1
        SQR_ADD_C2 a6L,a6R,a1L,a1R,c2,c3,c1
        SQR_ADD_C2 a5L,a5R,a2L,a2R,c2,c3,c1
        SQR_ADD_C2 a4L,a4R,a3L,a3R,c2,c3,c1
        STD     c2,56(r_ptr)            ; r[7] = c2;
        COPY    %r0,c2

        SQR_ADD_C  a4L,a4R,c3,c1,c2
        SQR_ADD_C2 a5L,a5R,a3L,a3R,c3,c1,c2
        SQR_ADD_C2 a6L,a6R,a2L,a2R,c3,c1,c2
        SQR_ADD_C2 a7L,a7R,a1L,a1R,c3,c1,c2
        STD     c3,64(r_ptr)            ; r[8] = c3;
        COPY    %r0,c3

        SQR_ADD_C2 a7L,a7R,a2L,a2R,c1,c2,c3
        SQR_ADD_C2 a6L,a6R,a3L,a3R,c1,c2,c3
        SQR_ADD_C2 a5L,a5R,a4L,a4R,c1,c2,c3
        STD     c1,72(r_ptr)            ; r[9] = c1;
        COPY    %r0,c1

        SQR_ADD_C  a5L,a5R,c2,c3,c1
        SQR_ADD_C2 a6L,a6R,a4L,a4R,c2,c3,c1
        SQR_ADD_C2 a7L,a7R,a3L,a3R,c2,c3,c1
        STD     c2,80(r_ptr)            ; r[10] = c2;
        COPY    %r0,c2

        SQR_ADD_C2 a7L,a7R,a4L,a4R,c3,c1,c2
        SQR_ADD_C2 a6L,a6R,a5L,a5R,c3,c1,c2
        STD     c3,88(r_ptr)            ; r[11] = c3;
        COPY    %r0,c3

        SQR_ADD_C  a6L,a6R,c1,c2,c3
        SQR_ADD_C2 a7L,a7R,a5L,a5R,c1,c2,c3
        STD     c1,96(r_ptr)            ; r[12] = c1;
        COPY    %r0,c1

        SQR_ADD_C2 a7L,a7R,a6L,a6R,c2,c3,c1
        STD     c2,104(r_ptr)           ; r[13] = c2;
        COPY    %r0,c2

        SQR_ADD_C  a7L,a7R,c3,c1,c2
        STD     c3,112(r_ptr)           ; r[14] = c3
        STD     c1,120(r_ptr)           ; r[15] = c1

        .EXIT
        LDD     -104(%sp),%r6           ; restore r6
        LDD     -112(%sp),%r5           ; restore r5
        LDD     -120(%sp),%r4           ; restore r4
        BVE     (%rp)
        LDD,MB  -128(%sp),%r3           ; delay slot: restore r3 and pop the frame

        .PROCEND

;-----------------------------------------------------------------------------
;
;void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
;   arg0 = r_ptr
;   arg1 = a_ptr
;
; Squares the 4-word value at a_ptr and stores the 8-word result at r_ptr.
; Same frame layout and accumulator rotation as bn_sqr_comba8.
;

bn_sqr_comba4
        .proc
        .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
        .EXPORT bn_sqr_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
        .entry
        .align 64
        STD     %r3,0(%sp)              ; save r3
        STD     %r4,8(%sp)              ; save r4
        STD     %r5,16(%sp)             ; save r5
        STD     %r6,24(%sp)             ; save r6

        ;
        ; Zero out carries
        ;
        COPY    %r0,c1
        COPY    %r0,c2
        COPY    %r0,c3

        LDO     128(%sp),%sp            ; bump stack
        DEPDI,Z -1,32,33,high_mask      ; mask 0xffffffff80000000 for SQR_ADD_C
        DEPDI,Z 1,31,1,high_one         ; the value 2^32

        ;
        ; Load the four input words.  The schedule below references only
        ; a0-a3, so nothing beyond a[3] is read; loading 32(a_ptr) ..
        ; 56(a_ptr) would read past the end of the 4-word input.
        ;
        FLDD    0(a_ptr),a0
        FLDD    8(a_ptr),a1
        FLDD    16(a_ptr),a2
        FLDD    24(a_ptr),a3

        SQR_ADD_C  a0L,a0R,c1,c2,c3

        STD     c1,0(r_ptr)             ; r[0] = c1;
        COPY    %r0,c1

        SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1

        STD     c2,8(r_ptr)             ; r[1] = c2;
        COPY    %r0,c2

        SQR_ADD_C  a1L,a1R,c3,c1,c2
        SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2

        STD     c3,16(r_ptr)            ; r[2] = c3;
        COPY    %r0,c3

        SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
        SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3

        STD     c1,24(r_ptr)            ; r[3] = c1;
        COPY    %r0,c1

        SQR_ADD_C  a2L,a2R,c2,c3,c1
        SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1

        STD     c2,32(r_ptr)            ; r[4] = c2;
        COPY    %r0,c2

        SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
        STD     c3,40(r_ptr)            ; r[5] = c3;
        COPY    %r0,c3

        SQR_ADD_C  a3L,a3R,c1,c2,c3
        STD     c1,48(r_ptr)            ; r[6] = c1;
        STD     c2,56(r_ptr)            ; r[7] = c2;

        .EXIT
        LDD     -104(%sp),%r6           ; restore r6
        LDD     -112(%sp),%r5           ; restore r5
        LDD     -120(%sp),%r4           ; restore r4
        BVE     (%rp)
        LDD,MB  -128(%sp),%r3           ; delay slot: restore r3 and pop the frame

        .PROCEND


;---------------------------------------------------------------------------
;
; MUL_ADD_C  A0L,A0R,B0L,B0R,C1,C2,C3
;
; Adds the 128-bit product of two 64-bit values (halves A0L:A0R and
; B0L:B0R) into the three-word accumulator C1 (lowest), C2, C3.
;
; Needs high_one to hold the value 1 shifted left by 32.  Uses
; ftemp1-ftemp4, temp1, temp3, m, m1, lt, ht, and the four doublewords at
; -32(%sp) .. -8(%sp) as a transfer area between FP and integer registers.
;
; The instruction order matters: ADD sets the carry bit, ADD,L leaves it
; alone, and ADD,DC consumes it.
;

MUL_ADD_C       .macro  A0L,A0R,B0L,B0R,C1,C2,C3
        XMPYU   A0L,B0R,ftemp1          ; m1 = a.hi * b.lo
        FSTD    ftemp1,-16(%sp)
        XMPYU   A0R,B0L,ftemp2          ; m  = a.lo * b.hi
        FSTD    ftemp2,-8(%sp)
        XMPYU   A0R,B0R,ftemp3          ; lt = a.lo * b.lo
        FSTD    ftemp3,-32(%sp)
        XMPYU   A0L,B0L,ftemp4          ; ht = a.hi * b.hi
        FSTD    ftemp4,-24(%sp)

        LDD     -8(%sp),m               ; m
        LDD     -16(%sp),m1             ; m1
        ADD,L   m,m1,m                  ; m += m1 (carry bit untouched)

        DEPD,Z  m,31,32,temp3           ; temp3 = m shifted left by 32
        LDD     -24(%sp),ht             ; ht

        CMPCLR,*>>= m,m1,%r0            ; next insn runs only if m is below m1 (sum wrapped)
        ADD,L   ht,high_one,ht          ; ht += 2^32

        EXTRD,U m,31,32,temp1           ; temp1 = m shifted right by 32
        LDD     -32(%sp),lt             ; lt
        ADD,L   ht,temp1,ht             ; ht += upper half of m
        ADD     lt,temp3,lt             ; lt += lower half of m, shifted up (sets carry)
        ADD,DC  ht,%r0,ht               ; ht += carry

        ADD     C1,lt,C1                ; C1 += lt
        ADD,DC  ht,%r0,ht               ; ht += carry out of C1

        ADD     C2,ht,C2                ; C2 += ht
        ADD,DC  C3,%r0,C3               ; C3 += carry out of C2
.endm


;
;void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
;   arg0 = r_ptr
;   arg1 = a_ptr
;   arg2 = b_ptr
;
; Multiplies the 8-word values at a_ptr and b_ptr and stores the 16-word
; product at r_ptr, one result column at a time.
;
; Frame (128 bytes): r3-r6 at offsets 0-24, fr12/fr13 at offsets 32/40
; (they back the b6/b7 aliases and are callee-save).  After the stack bump
; MUL_ADD_C uses -32(%sp) .. -8(%sp) as its transfer area.
;

bn_mul_comba8
        .proc
        .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
        .EXPORT bn_mul_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
        .entry
        .align 64

        STD     %r3,0(%sp)              ; save r3
        STD     %r4,8(%sp)              ; save r4
        STD     %r5,16(%sp)             ; save r5
        STD     %r6,24(%sp)             ; save r6
        FSTD    %fr12,32(%sp)           ; save fr12 (b6)
        FSTD    %fr13,40(%sp)           ; save fr13 (b7)

        ;
        ; Zero out carries
        ;
        COPY    %r0,c1
        COPY    %r0,c2
        COPY    %r0,c3

        LDO     128(%sp),%sp            ; bump stack
        DEPDI,Z 1,31,1,high_one         ; the value 2^32

        ;
        ; Load up all of the values we are going to use
        ;
        FLDD    0(a_ptr),a0
        FLDD    8(a_ptr),a1
        FLDD    16(a_ptr),a2
        FLDD    24(a_ptr),a3
        FLDD    32(a_ptr),a4
        FLDD    40(a_ptr),a5
        FLDD    48(a_ptr),a6
        FLDD    56(a_ptr),a7

        FLDD    0(b_ptr),b0
        FLDD    8(b_ptr),b1
        FLDD    16(b_ptr),b2
        FLDD    24(b_ptr),b3
        FLDD    32(b_ptr),b4
        FLDD    40(b_ptr),b5
        FLDD    48(b_ptr),b6
        FLDD    56(b_ptr),b7

        MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
        STD     c1,0(r_ptr)             ; r[0]
        COPY    %r0,c1

        MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
        MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
        STD     c2,8(r_ptr)             ; r[1]
        COPY    %r0,c2

        MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
        MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
        MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
        STD     c3,16(r_ptr)            ; r[2]
        COPY    %r0,c3

        MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
        MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
        MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
        MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
        STD     c1,24(r_ptr)            ; r[3]
        COPY    %r0,c1

        MUL_ADD_C a4L,a4R,b0L,b0R,c2,c3,c1
        MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
        MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
        MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
        MUL_ADD_C a0L,a0R,b4L,b4R,c2,c3,c1
        STD     c2,32(r_ptr)            ; r[4]
        COPY    %r0,c2

        MUL_ADD_C a0L,a0R,b5L,b5R,c3,c1,c2
        MUL_ADD_C a1L,a1R,b4L,b4R,c3,c1,c2
        MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
        MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
        MUL_ADD_C a4L,a4R,b1L,b1R,c3,c1,c2
        MUL_ADD_C a5L,a5R,b0L,b0R,c3,c1,c2
        STD     c3,40(r_ptr)            ; r[5]
        COPY    %r0,c3

        MUL_ADD_C a6L,a6R,b0L,b0R,c1,c2,c3
        MUL_ADD_C a5L,a5R,b1L,b1R,c1,c2,c3
        MUL_ADD_C a4L,a4R,b2L,b2R,c1,c2,c3
        MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
        MUL_ADD_C a2L,a2R,b4L,b4R,c1,c2,c3
        MUL_ADD_C a1L,a1R,b5L,b5R,c1,c2,c3
        MUL_ADD_C a0L,a0R,b6L,b6R,c1,c2,c3
        STD     c1,48(r_ptr)            ; r[6]
        COPY    %r0,c1

        MUL_ADD_C a0L,a0R,b7L,b7R,c2,c3,c1
        MUL_ADD_C a1L,a1R,b6L,b6R,c2,c3,c1
        MUL_ADD_C a2L,a2R,b5L,b5R,c2,c3,c1
        MUL_ADD_C a3L,a3R,b4L,b4R,c2,c3,c1
        MUL_ADD_C a4L,a4R,b3L,b3R,c2,c3,c1
        MUL_ADD_C a5L,a5R,b2L,b2R,c2,c3,c1
        MUL_ADD_C a6L,a6R,b1L,b1R,c2,c3,c1
        MUL_ADD_C a7L,a7R,b0L,b0R,c2,c3,c1
        STD     c2,56(r_ptr)            ; r[7]
        COPY    %r0,c2

        MUL_ADD_C a7L,a7R,b1L,b1R,c3,c1,c2
        MUL_ADD_C a6L,a6R,b2L,b2R,c3,c1,c2
        MUL_ADD_C a5L,a5R,b3L,b3R,c3,c1,c2
        MUL_ADD_C a4L,a4R,b4L,b4R,c3,c1,c2
        MUL_ADD_C a3L,a3R,b5L,b5R,c3,c1,c2
        MUL_ADD_C a2L,a2R,b6L,b6R,c3,c1,c2
        MUL_ADD_C a1L,a1R,b7L,b7R,c3,c1,c2
        STD     c3,64(r_ptr)            ; r[8]
        COPY    %r0,c3

        MUL_ADD_C a2L,a2R,b7L,b7R,c1,c2,c3
        MUL_ADD_C a3L,a3R,b6L,b6R,c1,c2,c3
        MUL_ADD_C a4L,a4R,b5L,b5R,c1,c2,c3
        MUL_ADD_C a5L,a5R,b4L,b4R,c1,c2,c3
        MUL_ADD_C a6L,a6R,b3L,b3R,c1,c2,c3
        MUL_ADD_C a7L,a7R,b2L,b2R,c1,c2,c3
        STD     c1,72(r_ptr)            ; r[9]
        COPY    %r0,c1

        MUL_ADD_C a7L,a7R,b3L,b3R,c2,c3,c1
        MUL_ADD_C a6L,a6R,b4L,b4R,c2,c3,c1
        MUL_ADD_C a5L,a5R,b5L,b5R,c2,c3,c1
        MUL_ADD_C a4L,a4R,b6L,b6R,c2,c3,c1
        MUL_ADD_C a3L,a3R,b7L,b7R,c2,c3,c1
        STD     c2,80(r_ptr)            ; r[10]
        COPY    %r0,c2

        MUL_ADD_C a4L,a4R,b7L,b7R,c3,c1,c2
        MUL_ADD_C a5L,a5R,b6L,b6R,c3,c1,c2
        MUL_ADD_C a6L,a6R,b5L,b5R,c3,c1,c2
        MUL_ADD_C a7L,a7R,b4L,b4R,c3,c1,c2
        STD     c3,88(r_ptr)            ; r[11]
        COPY    %r0,c3

        MUL_ADD_C a7L,a7R,b5L,b5R,c1,c2,c3
        MUL_ADD_C a6L,a6R,b6L,b6R,c1,c2,c3
        MUL_ADD_C a5L,a5R,b7L,b7R,c1,c2,c3
        STD     c1,96(r_ptr)            ; r[12]
        COPY    %r0,c1

        MUL_ADD_C a6L,a6R,b7L,b7R,c2,c3,c1
        MUL_ADD_C a7L,a7R,b6L,b6R,c2,c3,c1
        STD     c2,104(r_ptr)           ; r[13]
        COPY    %r0,c2

        MUL_ADD_C a7L,a7R,b7L,b7R,c3,c1,c2
        STD     c3,112(r_ptr)           ; r[14]
        STD     c1,120(r_ptr)           ; r[15]

        .EXIT
        FLDD    -88(%sp),%fr13          ; restore fr13
        FLDD    -96(%sp),%fr12          ; restore fr12
        LDD     -104(%sp),%r6           ; restore r6
        LDD     -112(%sp),%r5           ; restore r5
        LDD     -120(%sp),%r4           ; restore r4
        BVE     (%rp)
        LDD,MB  -128(%sp),%r3           ; delay slot: restore r3 and pop the frame

        .PROCEND

;-----------------------------------------------------------------------------
;
;void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
;   arg0 = r_ptr
;   arg1 = a_ptr
;   arg2 = b_ptr
;
; Multiplies the 4-word values at a_ptr and b_ptr and stores the 8-word
; product at r_ptr.  Same frame layout as bn_mul_comba8.  fr12/fr13 are
; still saved and restored, although the schedule below uses only b0-b3.
;

bn_mul_comba4
        .proc
        .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
        .EXPORT bn_mul_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
        .entry
        .align 64

        STD     %r3,0(%sp)              ; save r3
        STD     %r4,8(%sp)              ; save r4
        STD     %r5,16(%sp)             ; save r5
        STD     %r6,24(%sp)             ; save r6
        FSTD    %fr12,32(%sp)           ; save fr12
        FSTD    %fr13,40(%sp)           ; save fr13

        ;
        ; Zero out carries
        ;
        COPY    %r0,c1
        COPY    %r0,c2
        COPY    %r0,c3

        LDO     128(%sp),%sp            ; bump stack
        DEPDI,Z 1,31,1,high_one         ; the value 2^32

        ;
        ; Load up all of the values we are going to use
        ;
        FLDD    0(a_ptr),a0
        FLDD    8(a_ptr),a1
        FLDD    16(a_ptr),a2
        FLDD    24(a_ptr),a3

        FLDD    0(b_ptr),b0
        FLDD    8(b_ptr),b1
        FLDD    16(b_ptr),b2
        FLDD    24(b_ptr),b3

        MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
        STD     c1,0(r_ptr)             ; r[0]
        COPY    %r0,c1

        MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
        MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
        STD     c2,8(r_ptr)             ; r[1]
        COPY    %r0,c2

        MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
        MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
        MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
        STD     c3,16(r_ptr)            ; r[2]
        COPY    %r0,c3

        MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
        MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
        MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
        MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
        STD     c1,24(r_ptr)            ; r[3]
        COPY    %r0,c1

        MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
        MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
        MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
        STD     c2,32(r_ptr)            ; r[4]
        COPY    %r0,c2

        MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
        MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
        STD     c3,40(r_ptr)            ; r[5]
        COPY    %r0,c3

        MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
        STD     c1,48(r_ptr)            ; r[6]
        STD     c2,56(r_ptr)            ; r[7]

        .EXIT
        FLDD    -88(%sp),%fr13          ; restore fr13
        FLDD    -96(%sp),%fr12          ; restore fr12
        LDD     -104(%sp),%r6           ; restore r6
        LDD     -112(%sp),%r5           ; restore r5
        LDD     -120(%sp),%r4           ; restore r4
        BVE     (%rp)
        LDD,MB  -128(%sp),%r3           ; delay slot: restore r3 and pop the frame

        .PROCEND


;
; Literal data: C$4 is the message string referenced from
; bn_div_err_case.
;
        .SPACE  $TEXT$
        .SUBSPA $CODE$
        .SPACE  $PRIVATE$,SORT=16
        .IMPORT $global$,DATA
        .SPACE  $TEXT$
        .SUBSPA $CODE$
        .SUBSPA $LIT$,ACCESS=0x2c
C$4
        .ALIGN  8
        .STRINGZ "Division would overflow (%d)\n"
        .END