128257Smsmith//===----------------------Hexagon builtin routine ------------------------===// 255939Snsouch// 328257Smsmith// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 428257Smsmith// See https://llvm.org/LICENSE.txt for license information. 528257Smsmith// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 628257Smsmith// 728257Smsmith//===----------------------------------------------------------------------===// 828257Smsmith 928257Smsmith// Double Precision square root 1028257Smsmith 1128257Smsmith#define EXP r28 1228257Smsmith 1328257Smsmith#define A r1:0 1428257Smsmith#define AH r1 1528257Smsmith#define AL r0 1628257Smsmith 1728257Smsmith#define SFSH r3:2 1828257Smsmith#define SF_S r3 1928257Smsmith#define SF_H r2 2028257Smsmith 2128257Smsmith#define SFHALF_SONE r5:4 2228257Smsmith#define S_ONE r4 2328257Smsmith#define SFHALF r5 2428257Smsmith#define SF_D r6 2528257Smsmith#define SF_E r7 26119418Sobrien#define RECIPEST r8 27119418Sobrien#define SFRAD r9 28119418Sobrien 29119418Sobrien#define FRACRAD r11:10 3028257Smsmith#define FRACRADH r11 31187576Sjhb#define FRACRADL r10 3228257Smsmith 3355939Snsouch#define ROOT r13:12 34187576Sjhb#define ROOTHI r13 35187576Sjhb#define ROOTLO r12 3655939Snsouch 3755939Snsouch#define PROD r15:14 3828257Smsmith#define PRODHI r15 39185003Sjhb#define PRODLO r14 4055939Snsouch 4128257Smsmith#define P_TMP p0 4255939Snsouch#define P_EXP1 p1 43185003Sjhb#define NORMAL p2 44119284Simp 45119284Simp#define SF_EXPBITS 8 4655939Snsouch#define SF_MANTBITS 23 47185003Sjhb 4828257Smsmith#define DF_EXPBITS 11 4955939Snsouch#define DF_MANTBITS 52 5028257Smsmith 5155939Snsouch#define DF_BIAS 0x3ff 5228257Smsmith 5328257Smsmith#define DFCLASS_ZERO 0x01 5428257Smsmith#define DFCLASS_NORMAL 0x02 5528257Smsmith#define DFCLASS_DENORMAL 0x02 5655939Snsouch#define DFCLASS_INFINITE 0x08 5755939Snsouch#define DFCLASS_NAN 0x10 5828257Smsmith 59187576Sjhb#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG; .type __qdsp_##TAG,@function 6042475Snsouch#define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG; .type __hexagon_fast_##TAG,@function 6142475Snsouch#define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG; .type __hexagon_fast2_##TAG,@function 6228257Smsmith#define END(TAG) .size TAG,.-TAG 63227814Sattilio 64187576Sjhb .text 6542475Snsouch .global __hexagon_sqrtdf2 6642475Snsouch .type __hexagon_sqrtdf2,@function 6742475Snsouch .global __hexagon_sqrt 6855939Snsouch .type __hexagon_sqrt,@function 6942475Snsouch Q6_ALIAS(sqrtdf2) 7042475Snsouch Q6_ALIAS(sqrt) 7142475Snsouch FAST_ALIAS(sqrtdf2) 7242475Snsouch FAST_ALIAS(sqrt) 7342475Snsouch FAST2_ALIAS(sqrtdf2) 7442475Snsouch FAST2_ALIAS(sqrt) 7542475Snsouch .type sqrt,@function 7642475Snsouch .p2align 5 7755939Snsouch__hexagon_sqrtdf2: 7828257Smsmith__hexagon_sqrt: 7928257Smsmith { 80187576Sjhb PROD = extractu(A,#SF_MANTBITS+1,#DF_MANTBITS-SF_MANTBITS) 81187576Sjhb EXP = extractu(AH,#DF_EXPBITS,#DF_MANTBITS-32) 82187576Sjhb SFHALF_SONE = combine(##0x3f000004,#1) 83187576Sjhb } 84187576Sjhb { 8542475Snsouch NORMAL = dfclass(A,#DFCLASS_NORMAL) // Is it normal 8628257Smsmith NORMAL = cmp.gt(AH,#-1) // and positive? 8728257Smsmith if (!NORMAL.new) jump:nt .Lsqrt_abnormal 8828257Smsmith SFRAD = or(SFHALF,PRODLO) 8928257Smsmith } 9028257Smsmith#undef NORMAL 9128257Smsmith.Ldenormal_restart: 9255939Snsouch { 9338061Smsmith FRACRAD = A 9455939Snsouch SF_E,P_TMP = sfinvsqrta(SFRAD) 9538061Smsmith SFHALF = and(SFHALF,#-16) 9638061Smsmith SFSH = #0 9755939Snsouch } 9838061Smsmith#undef A 9955939Snsouch#undef AH 10038061Smsmith#undef AL 101227814Sattilio#define ERROR r1:0 10255939Snsouch#define ERRORHI r1 10338061Smsmith#define ERRORLO r0 10455939Snsouch // SF_E : reciprocal square root 10555939Snsouch // SF_H : half rsqrt 10655939Snsouch // sf_S : square root 10755939Snsouch // SF_D : error term 10855939Snsouch // SFHALF: 0.5 10955939Snsouch { 11055939Snsouch SF_S += sfmpy(SF_E,SFRAD):lib // s0: root 11155939Snsouch SF_H += sfmpy(SF_E,SFHALF):lib // h0: 0.5*y0. Could also decrement exponent... 11255939Snsouch SF_D = SFHALF 11355939Snsouch#undef SFRAD 11455939Snsouch#define SHIFTAMT r9 11555939Snsouch SHIFTAMT = and(EXP,#1) 11638061Smsmith } 117227814Sattilio { 11855939Snsouch SF_D -= sfmpy(SF_S,SF_H):lib // d0: 0.5-H*S = 0.5-0.5*~1 11955939Snsouch FRACRADH = insert(S_ONE,#DF_EXPBITS+1,#DF_MANTBITS-32) // replace upper bits with hidden 12038061Smsmith P_EXP1 = cmp.gtu(SHIFTAMT,#0) 12155939Snsouch } 12255939Snsouch { 12355939Snsouch SF_S += sfmpy(SF_S,SF_D):lib // s1: refine sqrt 12455939Snsouch SF_H += sfmpy(SF_H,SF_D):lib // h1: refine half-recip 12555939Snsouch SF_D = SFHALF 12655939Snsouch SHIFTAMT = mux(P_EXP1,#8,#9) 12755939Snsouch } 12855939Snsouch { 12955939Snsouch SF_D -= sfmpy(SF_S,SF_H):lib // d1: error term 13055939Snsouch FRACRAD = asl(FRACRAD,SHIFTAMT) // Move fracrad bits to right place 13155939Snsouch SHIFTAMT = mux(P_EXP1,#3,#2) 132227814Sattilio } 13363458Sn_hibma { 134185003Sjhb SF_H += sfmpy(SF_H,SF_D):lib // d2: rsqrt 13555939Snsouch // cool trick: half of 1/sqrt(x) has same mantissa as 1/sqrt(x). 13663458Sn_hibma PROD = asl(FRACRAD,SHIFTAMT) // fracrad<<(2+exp1) 13763458Sn_hibma } 13863458Sn_hibma { 13938061Smsmith SF_H = and(SF_H,##0x007fffff) 14038061Smsmith } 14138061Smsmith { 14238061Smsmith SF_H = add(SF_H,##0x00800000 - 3) 14342475Snsouch SHIFTAMT = mux(P_EXP1,#7,#8) 14442475Snsouch } 14542475Snsouch { 14642475Snsouch RECIPEST = asl(SF_H,SHIFTAMT) 14742475Snsouch SHIFTAMT = mux(P_EXP1,#15-(1+1),#15-(1+0)) 14855939Snsouch } 14942475Snsouch { 150187576Sjhb ROOT = mpyu(RECIPEST,PRODHI) // root = mpyu_full(recipest,hi(fracrad<<(2+exp1))) 151227814Sattilio } 15255939Snsouch 15342475Snsouch#undef SFSH // r3:2 15442475Snsouch#undef SF_H // r2 15542475Snsouch#undef SF_S // r3 15628257Smsmith#undef S_ONE // r4 15728257Smsmith#undef SFHALF // r5 15838061Smsmith#undef SFHALF_SONE // r5:4 15928257Smsmith#undef SF_D // r6 16028257Smsmith#undef SF_E // r7 16155939Snsouch 16228257Smsmith#define HL r3:2 163187576Sjhb#define LL r5:4 164227814Sattilio#define HH r7:6 16555939Snsouch 16628257Smsmith#undef P_EXP1 16728257Smsmith#define P_CARRY0 p1 16828257Smsmith#define P_CARRY1 p2 16928257Smsmith#define P_CARRY2 p3 17028257Smsmith 17138061Smsmith // Iteration 0 17228257Smsmith // Maybe we can save a cycle by starting with ERROR=asl(fracrad), then as we multiply 17328257Smsmith // We can shift and subtract instead of shift and add? 17455939Snsouch { 17528257Smsmith ERROR = asl(FRACRAD,#15) 176187576Sjhb PROD = mpyu(ROOTHI,ROOTHI) 177227814Sattilio P_CARRY0 = cmp.eq(r0,r0) 17855939Snsouch } 17928257Smsmith { 18028257Smsmith ERROR -= asl(PROD,#15) 18128257Smsmith PROD = mpyu(ROOTHI,ROOTLO) 18228257Smsmith P_CARRY1 = cmp.eq(r0,r0) 18328257Smsmith } 18438061Smsmith { 18528257Smsmith ERROR -= lsr(PROD,#16) 18628257Smsmith P_CARRY2 = cmp.eq(r0,r0) 18755939Snsouch } 18828257Smsmith { 18928257Smsmith ERROR = mpyu(ERRORHI,RECIPEST) 19028257Smsmith } 191227814Sattilio { 192187576Sjhb ROOT += lsr(ERROR,SHIFTAMT) 19355939Snsouch SHIFTAMT = add(SHIFTAMT,#16) 19428257Smsmith ERROR = asl(FRACRAD,#31) // for next iter 19528257Smsmith } 19628257Smsmith // Iteration 1 19728257Smsmith { 19839134Snsouch PROD = mpyu(ROOTHI,ROOTHI) 19928257Smsmith ERROR -= mpyu(ROOTHI,ROOTLO) // amount is 31, no shift needed 20028257Smsmith } 20128257Smsmith { 20228257Smsmith ERROR -= asl(PROD,#31) 20328257Smsmith PROD = mpyu(ROOTLO,ROOTLO) 204187576Sjhb } 205187576Sjhb { 206187576Sjhb ERROR -= lsr(PROD,#33) 207187576Sjhb } 208187576Sjhb { 209187576Sjhb ERROR = mpyu(ERRORHI,RECIPEST) 210187576Sjhb } 211187576Sjhb { 212187576Sjhb ROOT += lsr(ERROR,SHIFTAMT) 213187576Sjhb SHIFTAMT = add(SHIFTAMT,#16) 214187576Sjhb ERROR = asl(FRACRAD,#47) // for next iter 215187576Sjhb } 216187576Sjhb // Iteration 2 217187576Sjhb { 218187576Sjhb PROD = mpyu(ROOTHI,ROOTHI) 219187576Sjhb } 220187576Sjhb { 221187576Sjhb ERROR -= asl(PROD,#47) 222187576Sjhb PROD = mpyu(ROOTHI,ROOTLO) 223187576Sjhb } 224187576Sjhb { 225227758Sattilio ERROR -= asl(PROD,#16) // bidir shr 31-47 226187576Sjhb PROD = mpyu(ROOTLO,ROOTLO) 227187576Sjhb } 228187576Sjhb { 229187576Sjhb ERROR -= lsr(PROD,#17) // 64-47 230187576Sjhb } 231187576Sjhb { 232187576Sjhb ERROR = mpyu(ERRORHI,RECIPEST) 233187576Sjhb } 234187576Sjhb { 235187576Sjhb ROOT += lsr(ERROR,SHIFTAMT) 236187576Sjhb } 237187576Sjhb#undef ERROR 238187576Sjhb#undef PROD 239187576Sjhb#undef PRODHI 240187576Sjhb#undef PRODLO 241187576Sjhb#define REM_HI r15:14 242187576Sjhb#define REM_HI_HI r15 243#define REM_LO r1:0 244#undef RECIPEST 245#undef SHIFTAMT 246#define TWOROOT_LO r9:8 247 // Adjust Root 248 { 249 HL = mpyu(ROOTHI,ROOTLO) 250 LL = mpyu(ROOTLO,ROOTLO) 251 REM_HI = #0 252 REM_LO = #0 253 } 254 { 255 HL += lsr(LL,#33) 256 LL += asl(HL,#33) 257 P_CARRY0 = cmp.eq(r0,r0) 258 } 259 { 260 HH = mpyu(ROOTHI,ROOTHI) 261 REM_LO = sub(REM_LO,LL,P_CARRY0):carry 262 TWOROOT_LO = #1 263 } 264 { 265 HH += lsr(HL,#31) 266 TWOROOT_LO += asl(ROOT,#1) 267 } 268#undef HL 269#undef LL 270#define REM_HI_TMP r3:2 271#define REM_HI_TMP_HI r3 272#define REM_LO_TMP r5:4 273 { 274 REM_HI = sub(FRACRAD,HH,P_CARRY0):carry 275 REM_LO_TMP = sub(REM_LO,TWOROOT_LO,P_CARRY1):carry 276#undef FRACRAD 277#undef HH 278#define ZERO r11:10 279#define ONE r7:6 280 ONE = #1 281 ZERO = #0 282 } 283 { 284 REM_HI_TMP = sub(REM_HI,ZERO,P_CARRY1):carry 285 ONE = add(ROOT,ONE) 286 EXP = add(EXP,#-DF_BIAS) // subtract bias --> signed exp 287 } 288 { 289 // If carry set, no borrow: result was still positive 290 if (P_CARRY1) ROOT = ONE 291 if (P_CARRY1) REM_LO = REM_LO_TMP 292 if (P_CARRY1) REM_HI = REM_HI_TMP 293 } 294 { 295 REM_LO_TMP = sub(REM_LO,TWOROOT_LO,P_CARRY2):carry 296 ONE = #1 297 EXP = asr(EXP,#1) // divide signed exp by 2 298 } 299 { 300 REM_HI_TMP = sub(REM_HI,ZERO,P_CARRY2):carry 301 ONE = add(ROOT,ONE) 302 } 303 { 304 if (P_CARRY2) ROOT = ONE 305 if (P_CARRY2) REM_LO = REM_LO_TMP 306 // since tworoot <= 2^32, remhi must be zero 307#undef REM_HI_TMP 308#undef REM_HI_TMP_HI 309#define S_ONE r2 310#define ADJ r3 311 S_ONE = #1 312 } 313 { 314 P_TMP = cmp.eq(REM_LO,ZERO) // is the low part zero 315 if (!P_TMP.new) ROOTLO = or(ROOTLO,S_ONE) // if so, it's exact... hopefully 316 ADJ = cl0(ROOT) 317 EXP = add(EXP,#-63) 318 } 319#undef REM_LO 320#define RET r1:0 321#define RETHI r1 322 { 323 RET = convert_ud2df(ROOT) // set up mantissa, maybe set inexact flag 324 EXP = add(EXP,ADJ) // add back bias 325 } 326 { 327 RETHI += asl(EXP,#DF_MANTBITS-32) // add exponent adjust 328 jumpr r31 329 } 330#undef REM_LO_TMP 331#undef REM_HI_TMP 332#undef REM_HI_TMP_HI 333#undef REM_LO 334#undef REM_HI 335#undef TWOROOT_LO 336 337#undef RET 338#define A r1:0 339#define AH r1 340#define AL r1 341#undef S_ONE 342#define TMP r3:2 343#define TMPHI r3 344#define TMPLO r2 345#undef P_CARRY0 346#define P_NEG p1 347 348 349#define SFHALF r5 350#define SFRAD r9 351.Lsqrt_abnormal: 352 { 353 P_TMP = dfclass(A,#DFCLASS_ZERO) // zero? 354 if (P_TMP.new) jumpr:t r31 355 } 356 { 357 P_TMP = dfclass(A,#DFCLASS_NAN) 358 if (P_TMP.new) jump:nt .Lsqrt_nan 359 } 360 { 361 P_TMP = cmp.gt(AH,#-1) 362 if (!P_TMP.new) jump:nt .Lsqrt_invalid_neg 363 if (!P_TMP.new) EXP = ##0x7F800001 // sNaN 364 } 365 { 366 P_TMP = dfclass(A,#DFCLASS_INFINITE) 367 if (P_TMP.new) jumpr:nt r31 368 } 369 // If we got here, we're denormal 370 // prepare to restart 371 { 372 A = extractu(A,#DF_MANTBITS,#0) // Extract mantissa 373 } 374 { 375 EXP = add(clb(A),#-DF_EXPBITS) // how much to normalize? 376 } 377 { 378 A = asl(A,EXP) // Shift mantissa 379 EXP = sub(#1,EXP) // Form exponent 380 } 381 { 382 AH = insert(EXP,#1,#DF_MANTBITS-32) // insert lsb of exponent 383 } 384 { 385 TMP = extractu(A,#SF_MANTBITS+1,#DF_MANTBITS-SF_MANTBITS) // get sf value (mant+exp1) 386 SFHALF = ##0x3f000004 // form half constant 387 } 388 { 389 SFRAD = or(SFHALF,TMPLO) // form sf value 390 SFHALF = and(SFHALF,#-16) 391 jump .Ldenormal_restart // restart 392 } 393.Lsqrt_nan: 394 { 395 EXP = convert_df2sf(A) // if sNaN, get invalid 396 A = #-1 // qNaN 397 jumpr r31 398 } 399.Lsqrt_invalid_neg: 400 { 401 A = convert_sf2df(EXP) // Invalid,NaNval 402 jumpr r31 403 } 404END(__hexagon_sqrt) 405END(__hexagon_sqrtdf2) 406