128257Smsmith//===----------------------Hexagon builtin routine ------------------------===//
255939Snsouch//
328257Smsmith// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
428257Smsmith// See https://llvm.org/LICENSE.txt for license information.
528257Smsmith// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
628257Smsmith//
728257Smsmith//===----------------------------------------------------------------------===//
828257Smsmith
928257Smsmith// Double Precision square root
1028257Smsmith
1128257Smsmith#define EXP r28
1228257Smsmith
1328257Smsmith#define A r1:0
1428257Smsmith#define AH r1
1528257Smsmith#define AL r0
1628257Smsmith
1728257Smsmith#define SFSH r3:2
1828257Smsmith#define SF_S r3
1928257Smsmith#define SF_H r2
2028257Smsmith
2128257Smsmith#define SFHALF_SONE r5:4
2228257Smsmith#define S_ONE r4
2328257Smsmith#define SFHALF r5
2428257Smsmith#define SF_D r6
2528257Smsmith#define SF_E r7
26119418Sobrien#define RECIPEST r8
27119418Sobrien#define SFRAD r9
28119418Sobrien
29119418Sobrien#define FRACRAD r11:10
3028257Smsmith#define FRACRADH r11
31187576Sjhb#define FRACRADL r10
3228257Smsmith
3355939Snsouch#define ROOT r13:12
34187576Sjhb#define ROOTHI r13
35187576Sjhb#define ROOTLO r12
3655939Snsouch
3755939Snsouch#define PROD r15:14
3828257Smsmith#define PRODHI r15
39185003Sjhb#define PRODLO r14
4055939Snsouch
4128257Smsmith#define P_TMP p0
4255939Snsouch#define P_EXP1 p1
43185003Sjhb#define NORMAL p2
44119284Simp
45119284Simp#define SF_EXPBITS 8
4655939Snsouch#define SF_MANTBITS 23
47185003Sjhb
4828257Smsmith#define DF_EXPBITS 11
4955939Snsouch#define DF_MANTBITS 52
5028257Smsmith
5155939Snsouch#define DF_BIAS 0x3ff
5228257Smsmith
5328257Smsmith#define DFCLASS_ZERO     0x01
5428257Smsmith#define DFCLASS_NORMAL   0x02
5528257Smsmith#define DFCLASS_DENORMAL 0x02
5655939Snsouch#define DFCLASS_INFINITE 0x08
5755939Snsouch#define DFCLASS_NAN      0x10
5828257Smsmith
59187576Sjhb#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG; .type __qdsp_##TAG,@function
6042475Snsouch#define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG; .type __hexagon_fast_##TAG,@function
6142475Snsouch#define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG; .type __hexagon_fast2_##TAG,@function
6228257Smsmith#define END(TAG) .size TAG,.-TAG
63227814Sattilio
64187576Sjhb	.text
6542475Snsouch	.global __hexagon_sqrtdf2
6642475Snsouch	.type __hexagon_sqrtdf2,@function
6742475Snsouch	.global __hexagon_sqrt
6855939Snsouch	.type __hexagon_sqrt,@function
6942475Snsouch	Q6_ALIAS(sqrtdf2)
7042475Snsouch	Q6_ALIAS(sqrt)
7142475Snsouch	FAST_ALIAS(sqrtdf2)
7242475Snsouch	FAST_ALIAS(sqrt)
7342475Snsouch	FAST2_ALIAS(sqrtdf2)
7442475Snsouch	FAST2_ALIAS(sqrt)
7542475Snsouch	.type sqrt,@function
7642475Snsouch	.p2align 5
7755939Snsouch__hexagon_sqrtdf2:
7828257Smsmith__hexagon_sqrt:
7928257Smsmith	{
80187576Sjhb		PROD = extractu(A,#SF_MANTBITS+1,#DF_MANTBITS-SF_MANTBITS)
81187576Sjhb		EXP = extractu(AH,#DF_EXPBITS,#DF_MANTBITS-32)
82187576Sjhb		SFHALF_SONE = combine(##0x3f000004,#1)
83187576Sjhb	}
84187576Sjhb	{
8542475Snsouch		NORMAL = dfclass(A,#DFCLASS_NORMAL)		// Is it normal
8628257Smsmith		NORMAL = cmp.gt(AH,#-1)				// and positive?
8728257Smsmith		if (!NORMAL.new) jump:nt .Lsqrt_abnormal
8828257Smsmith		SFRAD = or(SFHALF,PRODLO)
8928257Smsmith	}
9028257Smsmith#undef NORMAL
9128257Smsmith.Ldenormal_restart:
9255939Snsouch	{
9338061Smsmith		FRACRAD = A
9455939Snsouch		SF_E,P_TMP = sfinvsqrta(SFRAD)
9538061Smsmith		SFHALF = and(SFHALF,#-16)
9638061Smsmith		SFSH = #0
9755939Snsouch	}
9838061Smsmith#undef A
9955939Snsouch#undef AH
10038061Smsmith#undef AL
101227814Sattilio#define ERROR r1:0
10255939Snsouch#define ERRORHI r1
10338061Smsmith#define ERRORLO r0
10455939Snsouch	// SF_E : reciprocal square root
10555939Snsouch	// SF_H : half rsqrt
10655939Snsouch	// sf_S : square root
10755939Snsouch	// SF_D : error term
10855939Snsouch	// SFHALF: 0.5
10955939Snsouch	{
11055939Snsouch		SF_S += sfmpy(SF_E,SFRAD):lib		// s0: root
11155939Snsouch		SF_H += sfmpy(SF_E,SFHALF):lib		// h0: 0.5*y0. Could also decrement exponent...
11255939Snsouch		SF_D = SFHALF
11355939Snsouch#undef SFRAD
11455939Snsouch#define SHIFTAMT r9
11555939Snsouch		SHIFTAMT = and(EXP,#1)
11638061Smsmith	}
117227814Sattilio	{
11855939Snsouch		SF_D -= sfmpy(SF_S,SF_H):lib		// d0: 0.5-H*S = 0.5-0.5*~1
11955939Snsouch		FRACRADH = insert(S_ONE,#DF_EXPBITS+1,#DF_MANTBITS-32)	// replace upper bits with hidden
12038061Smsmith		P_EXP1 = cmp.gtu(SHIFTAMT,#0)
12155939Snsouch	}
12255939Snsouch	{
12355939Snsouch		SF_S += sfmpy(SF_S,SF_D):lib		// s1: refine sqrt
12455939Snsouch		SF_H += sfmpy(SF_H,SF_D):lib		// h1: refine half-recip
12555939Snsouch		SF_D = SFHALF
12655939Snsouch		SHIFTAMT = mux(P_EXP1,#8,#9)
12755939Snsouch	}
12855939Snsouch	{
12955939Snsouch		SF_D -= sfmpy(SF_S,SF_H):lib		// d1: error term
13055939Snsouch		FRACRAD = asl(FRACRAD,SHIFTAMT)		// Move fracrad bits to right place
13155939Snsouch		SHIFTAMT = mux(P_EXP1,#3,#2)
132227814Sattilio	}
13363458Sn_hibma	{
134185003Sjhb		SF_H += sfmpy(SF_H,SF_D):lib		// d2: rsqrt
13555939Snsouch		// cool trick: half of 1/sqrt(x) has same mantissa as 1/sqrt(x).
13663458Sn_hibma		PROD = asl(FRACRAD,SHIFTAMT)		// fracrad<<(2+exp1)
13763458Sn_hibma	}
13863458Sn_hibma	{
13938061Smsmith		SF_H = and(SF_H,##0x007fffff)
14038061Smsmith	}
14138061Smsmith	{
14238061Smsmith		SF_H = add(SF_H,##0x00800000 - 3)
14342475Snsouch		SHIFTAMT = mux(P_EXP1,#7,#8)
14442475Snsouch	}
14542475Snsouch	{
14642475Snsouch		RECIPEST = asl(SF_H,SHIFTAMT)
14742475Snsouch		SHIFTAMT = mux(P_EXP1,#15-(1+1),#15-(1+0))
14855939Snsouch	}
14942475Snsouch	{
150187576Sjhb		ROOT = mpyu(RECIPEST,PRODHI)		// root = mpyu_full(recipest,hi(fracrad<<(2+exp1)))
151227814Sattilio	}
15255939Snsouch
15342475Snsouch#undef SFSH	// r3:2
15442475Snsouch#undef SF_H	// r2
15542475Snsouch#undef SF_S	// r3
15628257Smsmith#undef S_ONE	// r4
15728257Smsmith#undef SFHALF	// r5
15838061Smsmith#undef SFHALF_SONE	// r5:4
15928257Smsmith#undef SF_D	// r6
16028257Smsmith#undef SF_E	// r7
16155939Snsouch
16228257Smsmith#define HL r3:2
163187576Sjhb#define LL r5:4
164227814Sattilio#define HH r7:6
16555939Snsouch
16628257Smsmith#undef P_EXP1
16728257Smsmith#define P_CARRY0 p1
16828257Smsmith#define P_CARRY1 p2
16928257Smsmith#define P_CARRY2 p3
17028257Smsmith
17138061Smsmith	// Iteration 0
17228257Smsmith	// Maybe we can save a cycle by starting with ERROR=asl(fracrad), then as we multiply
17328257Smsmith	// We can shift and subtract instead of shift and add?
17455939Snsouch	{
17528257Smsmith		ERROR = asl(FRACRAD,#15)
176187576Sjhb		PROD = mpyu(ROOTHI,ROOTHI)
177227814Sattilio		P_CARRY0 = cmp.eq(r0,r0)
17855939Snsouch	}
17928257Smsmith	{
18028257Smsmith		ERROR -= asl(PROD,#15)
18128257Smsmith		PROD = mpyu(ROOTHI,ROOTLO)
18228257Smsmith		P_CARRY1 = cmp.eq(r0,r0)
18328257Smsmith	}
18438061Smsmith	{
18528257Smsmith		ERROR -= lsr(PROD,#16)
18628257Smsmith		P_CARRY2 = cmp.eq(r0,r0)
18755939Snsouch	}
18828257Smsmith	{
18928257Smsmith		ERROR = mpyu(ERRORHI,RECIPEST)
19028257Smsmith	}
191227814Sattilio	{
192187576Sjhb		ROOT += lsr(ERROR,SHIFTAMT)
19355939Snsouch		SHIFTAMT = add(SHIFTAMT,#16)
19428257Smsmith		ERROR = asl(FRACRAD,#31)		// for next iter
19528257Smsmith	}
19628257Smsmith	// Iteration 1
19728257Smsmith	{
19839134Snsouch		PROD = mpyu(ROOTHI,ROOTHI)
19928257Smsmith		ERROR -= mpyu(ROOTHI,ROOTLO)	// amount is 31, no shift needed
20028257Smsmith	}
20128257Smsmith	{
20228257Smsmith		ERROR -= asl(PROD,#31)
20328257Smsmith		PROD = mpyu(ROOTLO,ROOTLO)
204187576Sjhb	}
205187576Sjhb	{
206187576Sjhb		ERROR -= lsr(PROD,#33)
207187576Sjhb	}
208187576Sjhb	{
209187576Sjhb		ERROR = mpyu(ERRORHI,RECIPEST)
210187576Sjhb	}
211187576Sjhb	{
212187576Sjhb		ROOT += lsr(ERROR,SHIFTAMT)
213187576Sjhb		SHIFTAMT = add(SHIFTAMT,#16)
214187576Sjhb		ERROR = asl(FRACRAD,#47)	// for next iter
215187576Sjhb	}
216187576Sjhb	// Iteration 2
217187576Sjhb	{
218187576Sjhb		PROD = mpyu(ROOTHI,ROOTHI)
219187576Sjhb	}
220187576Sjhb	{
221187576Sjhb		ERROR -= asl(PROD,#47)
222187576Sjhb		PROD = mpyu(ROOTHI,ROOTLO)
223187576Sjhb	}
224187576Sjhb	{
225227758Sattilio		ERROR -= asl(PROD,#16)		// bidir shr 31-47
226187576Sjhb		PROD = mpyu(ROOTLO,ROOTLO)
227187576Sjhb	}
228187576Sjhb	{
229187576Sjhb		ERROR -= lsr(PROD,#17)		// 64-47
230187576Sjhb	}
231187576Sjhb	{
232187576Sjhb		ERROR = mpyu(ERRORHI,RECIPEST)
233187576Sjhb	}
234187576Sjhb	{
235187576Sjhb		ROOT += lsr(ERROR,SHIFTAMT)
236187576Sjhb	}
237187576Sjhb#undef ERROR
238187576Sjhb#undef PROD
239187576Sjhb#undef PRODHI
240187576Sjhb#undef PRODLO
241187576Sjhb#define REM_HI r15:14
242187576Sjhb#define REM_HI_HI r15
243#define REM_LO r1:0
244#undef RECIPEST
245#undef SHIFTAMT
246#define TWOROOT_LO r9:8
247	// Adjust Root
248	{
249		HL = mpyu(ROOTHI,ROOTLO)
250		LL = mpyu(ROOTLO,ROOTLO)
251		REM_HI = #0
252		REM_LO = #0
253	}
254	{
255		HL += lsr(LL,#33)
256		LL += asl(HL,#33)
257		P_CARRY0 = cmp.eq(r0,r0)
258	}
259	{
260		HH = mpyu(ROOTHI,ROOTHI)
261		REM_LO = sub(REM_LO,LL,P_CARRY0):carry
262		TWOROOT_LO = #1
263	}
264	{
265		HH += lsr(HL,#31)
266		TWOROOT_LO += asl(ROOT,#1)
267	}
268#undef HL
269#undef LL
270#define REM_HI_TMP r3:2
271#define REM_HI_TMP_HI r3
272#define REM_LO_TMP r5:4
273	{
274		REM_HI = sub(FRACRAD,HH,P_CARRY0):carry
275		REM_LO_TMP = sub(REM_LO,TWOROOT_LO,P_CARRY1):carry
276#undef FRACRAD
277#undef HH
278#define ZERO r11:10
279#define ONE r7:6
280		ONE = #1
281		ZERO = #0
282	}
283	{
284		REM_HI_TMP = sub(REM_HI,ZERO,P_CARRY1):carry
285		ONE = add(ROOT,ONE)
286		EXP = add(EXP,#-DF_BIAS)			// subtract bias --> signed exp
287	}
288	{
289				// If carry set, no borrow: result was still positive
290		if (P_CARRY1) ROOT = ONE
291		if (P_CARRY1) REM_LO = REM_LO_TMP
292		if (P_CARRY1) REM_HI = REM_HI_TMP
293	}
294	{
295		REM_LO_TMP = sub(REM_LO,TWOROOT_LO,P_CARRY2):carry
296		ONE = #1
297		EXP = asr(EXP,#1)				// divide signed exp by 2
298	}
299	{
300		REM_HI_TMP = sub(REM_HI,ZERO,P_CARRY2):carry
301		ONE = add(ROOT,ONE)
302	}
303	{
304		if (P_CARRY2) ROOT = ONE
305		if (P_CARRY2) REM_LO = REM_LO_TMP
306								// since tworoot <= 2^32, remhi must be zero
307#undef REM_HI_TMP
308#undef REM_HI_TMP_HI
309#define S_ONE r2
310#define ADJ r3
311		S_ONE = #1
312	}
313	{
314		P_TMP = cmp.eq(REM_LO,ZERO)			// is the low part zero
315		if (!P_TMP.new) ROOTLO = or(ROOTLO,S_ONE)	// if so, it's exact... hopefully
316		ADJ = cl0(ROOT)
317		EXP = add(EXP,#-63)
318	}
319#undef REM_LO
320#define RET r1:0
321#define RETHI r1
322	{
323		RET = convert_ud2df(ROOT)			// set up mantissa, maybe set inexact flag
324		EXP = add(EXP,ADJ)				// add back bias
325	}
326	{
327		RETHI += asl(EXP,#DF_MANTBITS-32)		// add exponent adjust
328		jumpr r31
329	}
330#undef REM_LO_TMP
331#undef REM_HI_TMP
332#undef REM_HI_TMP_HI
333#undef REM_LO
334#undef REM_HI
335#undef TWOROOT_LO
336
337#undef RET
338#define A r1:0
339#define AH r1
340#define AL r1
341#undef S_ONE
342#define TMP r3:2
343#define TMPHI r3
344#define TMPLO r2
345#undef P_CARRY0
346#define P_NEG p1
347
348
349#define SFHALF r5
350#define SFRAD r9
351.Lsqrt_abnormal:
352	{
353		P_TMP = dfclass(A,#DFCLASS_ZERO)			// zero?
354		if (P_TMP.new) jumpr:t r31
355	}
356	{
357		P_TMP = dfclass(A,#DFCLASS_NAN)
358		if (P_TMP.new) jump:nt .Lsqrt_nan
359	}
360	{
361		P_TMP = cmp.gt(AH,#-1)
362		if (!P_TMP.new) jump:nt .Lsqrt_invalid_neg
363		if (!P_TMP.new) EXP = ##0x7F800001			// sNaN
364	}
365	{
366		P_TMP = dfclass(A,#DFCLASS_INFINITE)
367		if (P_TMP.new) jumpr:nt r31
368	}
369	// If we got here, we're denormal
370	// prepare to restart
371	{
372		A = extractu(A,#DF_MANTBITS,#0)		// Extract mantissa
373	}
374	{
375		EXP = add(clb(A),#-DF_EXPBITS)		// how much to normalize?
376	}
377	{
378		A = asl(A,EXP)				// Shift mantissa
379		EXP = sub(#1,EXP)			// Form exponent
380	}
381	{
382		AH = insert(EXP,#1,#DF_MANTBITS-32)		// insert lsb of exponent
383	}
384	{
385		TMP = extractu(A,#SF_MANTBITS+1,#DF_MANTBITS-SF_MANTBITS)	// get sf value (mant+exp1)
386		SFHALF = ##0x3f000004						// form half constant
387	}
388	{
389		SFRAD = or(SFHALF,TMPLO)			// form sf value
390		SFHALF = and(SFHALF,#-16)
391		jump .Ldenormal_restart				// restart
392	}
393.Lsqrt_nan:
394	{
395		EXP = convert_df2sf(A)				// if sNaN, get invalid
396		A = #-1						// qNaN
397		jumpr r31
398	}
399.Lsqrt_invalid_neg:
400	{
401		A = convert_sf2df(EXP)				// Invalid,NaNval
402		jumpr r31
403	}
404END(__hexagon_sqrt)
405END(__hexagon_sqrtdf2)
406