1/* IEEE-754 double-precision functions for Xtensa
2   Copyright (C) 2006-2015 Free Software Foundation, Inc.
3   Contributed by Bob Wilson (bwilson@tensilica.com) at Tensilica.
4
5   This file is part of GCC.
6
7   GCC is free software; you can redistribute it and/or modify it
8   under the terms of the GNU General Public License as published by
9   the Free Software Foundation; either version 3, or (at your option)
10   any later version.
11
12   GCC is distributed in the hope that it will be useful, but WITHOUT
13   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
15   License for more details.
16
17   Under Section 7 of GPL version 3, you are granted additional
18   permissions described in the GCC Runtime Library Exception, version
19   3.1, as published by the Free Software Foundation.
20
21   You should have received a copy of the GNU General Public License and
22   a copy of the GCC Runtime Library Exception along with this program;
23   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
24   <http://www.gnu.org/licenses/>.  */
25
26#ifdef __XTENSA_EB__
27#define xh a2
28#define xl a3
29#define yh a4
30#define yl a5
31#else
32#define xh a3
33#define xl a2
34#define yh a5
35#define yl a4
36#endif
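/* For reference, the xh/xl naming above corresponds to the following C
   view of a double, where "hi" is the word holding the sign, the 11-bit
   exponent and the top 20 mantissa bits (a sketch; the struct and helper
   names here are illustrative only, not part of this file):

       #include <stdint.h>
       #include <string.h>

       typedef struct { uint32_t hi, lo; } dwords;

       static dwords split (double d)
       {
         uint64_t bits;
         memcpy (&bits, &d, sizeof bits);      // reinterpret the IEEE-754 pattern
         dwords w = { (uint32_t) (bits >> 32), (uint32_t) bits };
         return w;
       }

   On a big-endian target the high word arrives in the first register of
   each pair (xh == a2, yh == a4); on a little-endian target it arrives
   in the second (xh == a3, yh == a5).  */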
37
38/*  Warning!  The branch displacements for some Xtensa branch instructions
39    are quite small, and this code has been carefully laid out to keep
40    branch targets in range.  If you change anything, be sure to check that
41    the assembler is not relaxing anything to branch over a jump.  */
42
43#ifdef L_negdf2
44
45	.align	4
46	.global	__negdf2
47	.type	__negdf2, @function
48__negdf2:
49	leaf_entry sp, 16
50	movi	a4, 0x80000000
51	xor	xh, xh, a4
52	leaf_return
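	/* In C terms the operation above is simply (a sketch):

	       hi ^= 0x80000000;	// flip the IEEE-754 sign bit only

	   The exponent and mantissa are untouched, so NaNs and infinities
	   are negated the same way, as expected of a pure sign-bit
	   operation.  */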
53
54#endif /* L_negdf2 */
55
56#ifdef L_addsubdf3
57
58	/* Addition */
59__adddf3_aux:
60
61	/* Handle NaNs and Infinities.  (This code is placed before the
62	   start of the function just to keep it in range of the limited
63	   branch displacements.)  */
64
65.Ladd_xnan_or_inf:
66	/* If y is neither Infinity nor NaN, return x.  */
67	bnall	yh, a6, 1f
68	/* If x is a NaN, return it.  Otherwise, return y.  */
69	slli	a7, xh, 12
70	or	a7, a7, xl
71	beqz	a7, .Ladd_ynan_or_inf
721:	leaf_return
73
74.Ladd_ynan_or_inf:
75	/* Return y.  */
76	mov	xh, yh
77	mov	xl, yl
78	leaf_return
79
80.Ladd_opposite_signs:
81	/* Operand signs differ.  Do a subtraction.  */
82	slli	a7, a6, 11
83	xor	yh, yh, a7
84	j	.Lsub_same_sign
85
86	.align	4
87	.global	__adddf3
88	.type	__adddf3, @function
89__adddf3:
90	leaf_entry sp, 16
91	movi	a6, 0x7ff00000
92
93	/* Check if the two operands have the same sign.  */
94	xor	a7, xh, yh
95	bltz	a7, .Ladd_opposite_signs
96
97.Ladd_same_sign:
98	/* Check if either exponent == 0x7ff (i.e., NaN or Infinity).  */
99	ball	xh, a6, .Ladd_xnan_or_inf
100	ball	yh, a6, .Ladd_ynan_or_inf
101
102	/* Compare the exponents.  The smaller operand will be shifted
103	   right by the exponent difference and added to the larger
104	   one.  */
105	extui	a7, xh, 20, 12
106	extui	a8, yh, 20, 12
107	bltu	a7, a8, .Ladd_shiftx
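	/* The align-and-add step that follows amounts to this, in rough C
	   over a single 64-bit mantissa (a sketch only; the code below works
	   on 32-bit halves, keeps the shifted-out bits separately for
	   rounding, and splits the shift into the d < 32, 32 <= d < 64 and
	   d >= 64 cases):

	       int d = exp_x - exp_y;	           // exp_x >= exp_y on this path
	       uint64_t sticky = my << (64 - d);   // bits lost from y (0 < d < 64)
	       my >>= d;                           // align y to x's exponent
	       mx += my;                           // same-sign magnitude addition
	*/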
108
109.Ladd_shifty:
110	/* Check if the smaller (or equal) exponent is zero.  */
111	bnone	yh, a6, .Ladd_yexpzero
112
113	/* Replace yh sign/exponent with 0x001.  */
114	or	yh, yh, a6
115	slli	yh, yh, 11
116	srli	yh, yh, 11
117
118.Ladd_yexpdiff:
119	/* Compute the exponent difference.  Optimize for difference < 32.  */
120	sub	a10, a7, a8
121	bgeui	a10, 32, .Ladd_bigshifty
122
123	/* Shift yh/yl right by the exponent difference.  Any bits that are
124	   shifted out of yl are saved in a9 for rounding the result.  */
125	ssr	a10
126	movi	a9, 0
127	src	a9, yl, a9
128	src	yl, yh, yl
129	srl	yh, yh
130
131.Ladd_addy:
132	/* Do the 64-bit addition.  */
133	add	xl, xl, yl
134	add	xh, xh, yh
135	bgeu	xl, yl, 1f
136	addi	xh, xh, 1
1371:
138	/* Check if the add overflowed into the exponent.  */
139	extui	a10, xh, 20, 12
140	beq	a10, a7, .Ladd_round
141	mov	a8, a7
142	j	.Ladd_carry
143
144.Ladd_yexpzero:
145	/* y is a subnormal value.  Replace its sign/exponent with zero,
146	   i.e., no implicit "1.0", and increment the apparent exponent
147	   because subnormals behave as if they had the minimum (nonzero)
148	   exponent.  Test for the case when both exponents are zero.  */
149	slli	yh, yh, 12
150	srli	yh, yh, 12
151	bnone	xh, a6, .Ladd_bothexpzero
152	addi	a8, a8, 1
153	j	.Ladd_yexpdiff
154
155.Ladd_bothexpzero:
156	/* Both exponents are zero.  Handle this as a special case.  There
157	   is no need to shift or round, and the normal code for handling
158	   a carry into the exponent field will not work because it
159	   assumes there is an implicit "1.0" that needs to be added.  */
160	add	xl, xl, yl
161	add	xh, xh, yh
162	bgeu	xl, yl, 1f
163	addi	xh, xh, 1
1641:	leaf_return
165
166.Ladd_bigshifty:
167	/* Exponent difference >= 64 -- just return the bigger value.  */
168	bgeui	a10, 64, 1b
169
170	/* Shift yh/yl right by the exponent difference.  Any bits that are
171	   shifted out are saved in a9 for rounding the result.  */
172	ssr	a10
173	sll	a11, yl		/* lost bits shifted out of yl */
174	src	a9, yh, yl
175	srl	yl, yh
176	movi	yh, 0
177	beqz	a11, .Ladd_addy
178	or	a9, a9, a10	/* any positive, nonzero value will work */
179	j	.Ladd_addy
180
181.Ladd_xexpzero:
182	/* Same as "yexpzero" except skip handling the case when both
183	   exponents are zero.  */
184	slli	xh, xh, 12
185	srli	xh, xh, 12
186	addi	a7, a7, 1
187	j	.Ladd_xexpdiff
188
189.Ladd_shiftx:
190	/* Same thing as the "shifty" code, but with x and y swapped.  Also,
191	   because the exponent difference is always nonzero in this version,
192	   the shift sequence can use SLL and skip loading a constant zero.  */
193	bnone	xh, a6, .Ladd_xexpzero
194
195	or	xh, xh, a6
196	slli	xh, xh, 11
197	srli	xh, xh, 11
198
199.Ladd_xexpdiff:
200	sub	a10, a8, a7
201	bgeui	a10, 32, .Ladd_bigshiftx
202
203	ssr	a10
204	sll	a9, xl
205	src	xl, xh, xl
206	srl	xh, xh
207
208.Ladd_addx:
209	add	xl, xl, yl
210	add	xh, xh, yh
211	bgeu	xl, yl, 1f
212	addi	xh, xh, 1
2131:
214	/* Check if the add overflowed into the exponent.  */
215	extui	a10, xh, 20, 12
216	bne	a10, a8, .Ladd_carry
217
218.Ladd_round:
219	/* Round up if the leftover fraction is >= 1/2.  */
220	bgez	a9, 1f
221	addi	xl, xl, 1
222	beqz	xl, .Ladd_roundcarry
223
224	/* Check if the leftover fraction is exactly 1/2.  */
225	slli	a9, a9, 1
226	beqz	a9, .Ladd_exactlyhalf
2271:	leaf_return
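	/* The rounding used throughout this file is IEEE round-to-nearest,
	   ties-to-even.  With the discarded bits collected in a 32-bit word
	   whose msb is the guard bit (a9 here), it amounts to this C sketch:

	       if (rest & 0x80000000) {		// discarded fraction >= 1/2 ulp
	           mant += 1;			// round up (a carry into the exponent is fine)
	           if ((rest << 1) == 0)	// exactly 1/2 ulp: a tie
	               mant &= ~1;		// force the lsb to zero ("round to even")
	       }
	*/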
228
229.Ladd_bigshiftx:
230	/* Mostly the same thing as "bigshifty"....  */
231	bgeui	a10, 64, .Ladd_returny
232
233	ssr	a10
234	sll	a11, xl
235	src	a9, xh, xl
236	srl	xl, xh
237	movi	xh, 0
238	beqz	a11, .Ladd_addx
239	or	a9, a9, a10
240	j	.Ladd_addx
241
242.Ladd_returny:
243	mov	xh, yh
244	mov	xl, yl
245	leaf_return
246
247.Ladd_carry:
248	/* The addition has overflowed into the exponent field, so the
249	   value needs to be renormalized.  The mantissa of the result
250	   can be recovered by subtracting the original exponent and
251	   adding 0x100000 (which is the explicit "1.0" for the
252	   mantissa of the non-shifted operand -- the "1.0" for the
253	   shifted operand was already added).  The mantissa can then
254	   be shifted right by one bit.  The explicit "1.0" of the
255	   shifted mantissa then needs to be replaced by the exponent,
256	   incremented by one to account for the normalizing shift.
257	   It is faster to combine these operations: do the shift first
258	   and combine the additions and subtractions.  If x is the
259	   original exponent, the result is:
260	       shifted mantissa - (x << 19) + (1 << 19) + (x << 20)
261	   or:
262	       shifted mantissa + ((x + 1) << 19)
263	   Note that the exponent is incremented here by leaving the
264	   explicit "1.0" of the mantissa in the exponent field.  */
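	/* The simplification above is just algebra:  -(x << 19) + (x << 20)
	   equals (x << 19), so adding the remaining (1 << 19) term gives
	   ((x + 1) << 19).  As a C check, with m and x held in uint32_t
	   (sketch):

	       uint32_t naive    = m - (x << 19) + (1 << 19) + (x << 20);
	       uint32_t combined = m + ((x + 1) << 19);	// same value mod 2^32
	*/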
265
266	/* Shift xh/xl right by one bit.  Save the lsb of xl.  */
267	mov	a10, xl
268	ssai	1
269	src	xl, xh, xl
270	srl	xh, xh
271
272	/* See explanation above.  The original exponent is in a8.  */
273	addi	a8, a8, 1
274	slli	a8, a8, 19
275	add	xh, xh, a8
276
277	/* Return an Infinity if the exponent overflowed.  */
278	ball	xh, a6, .Ladd_infinity
279
280	/* Same thing as the "round" code except the msb of the leftover
281	   fraction is bit 0 of a10, with the rest of the fraction in a9.  */
282	bbci.l	a10, 0, 1f
283	addi	xl, xl, 1
284	beqz	xl, .Ladd_roundcarry
285	beqz	a9, .Ladd_exactlyhalf
2861:	leaf_return
287
288.Ladd_infinity:
289	/* Clear the mantissa.  */
290	movi	xl, 0
291	srli	xh, xh, 20
292	slli	xh, xh, 20
293
294	/* The sign bit may have been lost in a carry-out.  Put it back.  */
295	slli	a8, a8, 1
296	or	xh, xh, a8
297	leaf_return
298
299.Ladd_exactlyhalf:
300	/* Round down to the nearest even value.  */
301	srli	xl, xl, 1
302	slli	xl, xl, 1
303	leaf_return
304
305.Ladd_roundcarry:
306	/* xl is always zero when the rounding increment overflows, so
307	   there's no need to round it to an even value.  */
308	addi	xh, xh, 1
309	/* Overflow to the exponent is OK.  */
310	leaf_return
311
312
313	/* Subtraction */
314__subdf3_aux:
315
316	/* Handle NaNs and Infinities.  (This code is placed before the
317	   start of the function just to keep it in range of the limited
318	   branch displacements.)  */
319
320.Lsub_xnan_or_inf:
321	/* If y is neither Infinity nor NaN, return x.  */
322	bnall	yh, a6, 1f
323	/* Both x and y are either NaN or Inf, so the result is NaN.  */
324	movi	a4, 0x80000	/* make it a quiet NaN */
325	or	xh, xh, a4
3261:	leaf_return
327
328.Lsub_ynan_or_inf:
329	/* Negate y and return it.  */
330	slli	a7, a6, 11
331	xor	xh, yh, a7
332	mov	xl, yl
333	leaf_return
334
335.Lsub_opposite_signs:
336	/* Operand signs differ.  Do an addition.  */
337	slli	a7, a6, 11
338	xor	yh, yh, a7
339	j	.Ladd_same_sign
340
341	.align	4
342	.global	__subdf3
343	.type	__subdf3, @function
344__subdf3:
345	leaf_entry sp, 16
346	movi	a6, 0x7ff00000
347
348	/* Check if the two operands have the same sign.  */
349	xor	a7, xh, yh
350	bltz	a7, .Lsub_opposite_signs
351
352.Lsub_same_sign:
353	/* Check if either exponent == 0x7ff (i.e., NaN or Infinity).  */
354	ball	xh, a6, .Lsub_xnan_or_inf
355	ball	yh, a6, .Lsub_ynan_or_inf
356
357	/* Compare the operands.  In contrast to addition, the entire
358	   value matters here.  */
359	extui	a7, xh, 20, 11
360	extui	a8, yh, 20, 11
361	bltu	xh, yh, .Lsub_xsmaller
362	beq	xh, yh, .Lsub_compare_low
363
364.Lsub_ysmaller:
365	/* Check if the smaller (or equal) exponent is zero.  */
366	bnone	yh, a6, .Lsub_yexpzero
367
368	/* Replace yh sign/exponent with 0x001.  */
369	or	yh, yh, a6
370	slli	yh, yh, 11
371	srli	yh, yh, 11
372
373.Lsub_yexpdiff:
374	/* Compute the exponent difference.  Optimize for difference < 32.  */
375	sub	a10, a7, a8
376	bgeui	a10, 32, .Lsub_bigshifty
377
378	/* Shift yh/yl right by the exponent difference.  Any bits that are
379	   shifted out of yl are saved in a9 for rounding the result.  */
380	ssr	a10
381	movi	a9, 0
382	src	a9, yl, a9
383	src	yl, yh, yl
384	srl	yh, yh
385
386.Lsub_suby:
387	/* Do the 64-bit subtraction.  */
388	sub	xh, xh, yh
389	bgeu	xl, yl, 1f
390	addi	xh, xh, -1
3911:	sub	xl, xl, yl
392
393	/* Subtract the leftover bits in a9 from zero and propagate any
394	   resulting borrow into xl and, if needed, into xh.  */
395	neg	a9, a9
396	beqz	a9, 1f
397	addi	a5, xh, -1
398	moveqz	xh, a5, xl
399	addi	xl, xl, -1
4001:
401	/* Check if the subtract underflowed into the exponent.  */
402	extui	a10, xh, 20, 11
403	beq	a10, a7, .Lsub_round
404	j	.Lsub_borrow
405
406.Lsub_compare_low:
407	/* The high words are equal.  Compare the low words.  */
408	bltu	xl, yl, .Lsub_xsmaller
409	bltu	yl, xl, .Lsub_ysmaller
410	/* The operands are equal.  Return 0.0.  */
411	movi	xh, 0
412	movi	xl, 0
4131:	leaf_return
414
415.Lsub_yexpzero:
416	/* y is a subnormal value.  Replace its sign/exponent with zero,
417	   i.e., no implicit "1.0".  Unless x is also a subnormal, increment
418	   y's apparent exponent because subnormals behave as if they had
419	   the minimum (nonzero) exponent.  */
420	slli	yh, yh, 12
421	srli	yh, yh, 12
422	bnone	xh, a6, .Lsub_yexpdiff
423	addi	a8, a8, 1
424	j	.Lsub_yexpdiff
425
426.Lsub_bigshifty:
427	/* Exponent difference >= 64 -- just return the bigger value.  */
428	bgeui	a10, 64, 1b
429
430	/* Shift yh/yl right by the exponent difference.  Any bits that are
431	   shifted out are saved in a9 for rounding the result.  */
432	ssr	a10
433	sll	a11, yl		/* lost bits shifted out of yl */
434	src	a9, yh, yl
435	srl	yl, yh
436	movi	yh, 0
437	beqz	a11, .Lsub_suby
438	or	a9, a9, a10	/* any positive, nonzero value will work */
439	j	.Lsub_suby
440
441.Lsub_xsmaller:
442	/* Same thing as the "ysmaller" code, but with x and y swapped and
443	   with y negated.  */
444	bnone	xh, a6, .Lsub_xexpzero
445
446	or	xh, xh, a6
447	slli	xh, xh, 11
448	srli	xh, xh, 11
449
450.Lsub_xexpdiff:
451	sub	a10, a8, a7
452	bgeui	a10, 32, .Lsub_bigshiftx
453
454	ssr	a10
455	movi	a9, 0
456	src	a9, xl, a9
457	src	xl, xh, xl
458	srl	xh, xh
459
460	/* Negate y.  */
461	slli	a11, a6, 11
462	xor	yh, yh, a11
463
464.Lsub_subx:
465	sub	xl, yl, xl
466	sub	xh, yh, xh
467	bgeu	yl, xl, 1f
468	addi	xh, xh, -1
4691:
470	/* Subtract the leftover bits in a9 from zero and propagate any
471	   resulting borrow into xl and, if needed, into xh.  */
472	neg	a9, a9
473	beqz	a9, 1f
474	addi	a5, xh, -1
475	moveqz	xh, a5, xl
476	addi	xl, xl, -1
4771:
478	/* Check if the subtract underflowed into the exponent.  */
479	extui	a10, xh, 20, 11
480	bne	a10, a8, .Lsub_borrow
481
482.Lsub_round:
483	/* Round up if the leftover fraction is >= 1/2.  */
484	bgez	a9, 1f
485	addi	xl, xl, 1
486	beqz	xl, .Lsub_roundcarry
487
488	/* Check if the leftover fraction is exactly 1/2.  */
489	slli	a9, a9, 1
490	beqz	a9, .Lsub_exactlyhalf
4911:	leaf_return
492
493.Lsub_xexpzero:
494	/* Same as "yexpzero".  */
495	slli	xh, xh, 12
496	srli	xh, xh, 12
497	bnone	yh, a6, .Lsub_xexpdiff
498	addi	a7, a7, 1
499	j	.Lsub_xexpdiff
500
501.Lsub_bigshiftx:
502	/* Mostly the same thing as "bigshifty", but with the sign bit of the
503	   shifted value set so that the subsequent subtraction flips the
504	   sign of y.  */
505	bgeui	a10, 64, .Lsub_returny
506
507	ssr	a10
508	sll	a11, xl
509	src	a9, xh, xl
510	srl	xl, xh
511	slli	xh, a6, 11	/* set sign bit of xh */
512	beqz	a11, .Lsub_subx
513	or	a9, a9, a10
514	j	.Lsub_subx
515
516.Lsub_returny:
517	/* Negate and return y.  */
518	slli	a7, a6, 11
519	xor	xh, yh, a7
520	mov	xl, yl
521	leaf_return
522
523.Lsub_borrow:
524	/* The subtraction has underflowed into the exponent field, so the
525	   value needs to be renormalized.  Shift the mantissa left as
526	   needed to remove any leading zeros and adjust the exponent
527	   accordingly.  If the exponent is not large enough to remove
528	   all the leading zeros, the result will be a subnormal value.  */
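	/* In rough C, the renormalization is (a sketch over a 64-bit
	   mantissa; clz64() stands for an assumed count-leading-zeros
	   helper, and the off-by-one details spelled out in the comments
	   below are glossed over):

	       int lz = clz64 (mant) - 11;	// leading zeros inside the 53-bit field
	       if (lz < exp) {			// enough exponent range: normal result
	           mant <<= lz;
	           exp  -= lz;
	       } else {				// not enough: result is subnormal
	           mant <<= exp;		// shift only as far as the exponent allows
	           exp   = 0;			// field 0 == effective exponent 1
	       }
	*/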
529
530	slli	a8, xh, 12
531	beqz	a8, .Lsub_xhzero
532	do_nsau	a6, a8, a7, a11
533	srli	a8, a8, 12
534	bge	a6, a10, .Lsub_subnormal
535	addi	a6, a6, 1
536
537.Lsub_shift_lt32:
538	/* Shift the mantissa (a8/xl/a9) left by a6.  */
539	ssl	a6
540	src	a8, a8, xl
541	src	xl, xl, a9
542	sll	a9, a9
543
544	/* Combine the shifted mantissa with the sign and exponent,
545	   decrementing the exponent by a6.  (The exponent has already
546	   been decremented by one due to the borrow from the subtraction,
547	   but adding the mantissa will increment the exponent by one.)  */
548	srli	xh, xh, 20
549	sub	xh, xh, a6
550	slli	xh, xh, 20
551	add	xh, xh, a8
552	j	.Lsub_round
553
554.Lsub_exactlyhalf:
555	/* Round down to the nearest even value.  */
556	srli	xl, xl, 1
557	slli	xl, xl, 1
558	leaf_return
559
560.Lsub_roundcarry:
561	/* xl is always zero when the rounding increment overflows, so
562	   there's no need to round it to an even value.  */
563	addi	xh, xh, 1
564	/* Overflow to the exponent is OK.  */
565	leaf_return
566
567.Lsub_xhzero:
568	/* When normalizing the result, all the mantissa bits in the high
569	   word are zero.  Shift by "20 + (leading zero count of xl) + 1".  */
570	do_nsau	a6, xl, a7, a11
571	addi	a6, a6, 21
572	blt	a10, a6, .Lsub_subnormal
573
574.Lsub_normalize_shift:
575	bltui	a6, 32, .Lsub_shift_lt32
576
577	ssl	a6
578	src	a8, xl, a9
579	sll	xl, a9
580	movi	a9, 0
581
582	srli	xh, xh, 20
583	sub	xh, xh, a6
584	slli	xh, xh, 20
585	add	xh, xh, a8
586	j	.Lsub_round
587
588.Lsub_subnormal:
589	/* The exponent is too small to shift away all the leading zeros.
590	   Set a6 to the current exponent (which has already been
591	   decremented by the borrow) so that the exponent of the result
592	   will be zero.  Do not add 1 to a6 in this case, because: (1)
593	   adding the mantissa will not increment the exponent, so there is
594	   no need to subtract anything extra from the exponent to
595	   compensate, and (2) the effective exponent of a subnormal is 1
596	   not 0 so the shift amount must be 1 smaller than normal. */
597	mov	a6, a10
598	j	.Lsub_normalize_shift
599
600#endif /* L_addsubdf3 */
601
602#ifdef L_muldf3
603
604	/* Multiplication */
605#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
606#define XCHAL_NO_MUL 1
607#endif
608
609__muldf3_aux:
610
611	/* Handle unusual cases (zeros, subnormals, NaNs and Infinities).
612	   (This code is placed before the start of the function just to
613	   keep it in range of the limited branch displacements.)  */
614
615.Lmul_xexpzero:
616	/* Clear the sign bit of x.  */
617	slli	xh, xh, 1
618	srli	xh, xh, 1
619
620	/* If x is zero, return zero.  */
621	or	a10, xh, xl
622	beqz	a10, .Lmul_return_zero
623
624	/* Normalize x.  Adjust the exponent in a8.  */
625	beqz	xh, .Lmul_xh_zero
626	do_nsau	a10, xh, a11, a12
627	addi	a10, a10, -11
628	ssl	a10
629	src	xh, xh, xl
630	sll	xl, xl
631	movi	a8, 1
632	sub	a8, a8, a10
633	j	.Lmul_xnormalized
634.Lmul_xh_zero:
635	do_nsau	a10, xl, a11, a12
636	addi	a10, a10, -11
637	movi	a8, -31
638	sub	a8, a8, a10
639	ssl	a10
640	bltz	a10, .Lmul_xl_srl
641	sll	xh, xl
642	movi	xl, 0
643	j	.Lmul_xnormalized
644.Lmul_xl_srl:
645	srl	xh, xl
646	sll	xl, xl
647	j	.Lmul_xnormalized
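	/* Normalizing a subnormal input, in rough C (a sketch; clz64() is an
	   assumed count-leading-zeros helper):

	       int lz = clz64 (mant) - 11;	// zeros beyond the implicit-1 position
	       mant <<= lz;			// bring the leading 1 up to bit 52
	       exp   = 1 - lz;			// subnormals have effective exponent 1

	   The split into xh != 0 and xh == 0 cases above is just this shift
	   carried out on a 32-bit register pair.  */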
648
649.Lmul_yexpzero:
650	/* Clear the sign bit of y.  */
651	slli	yh, yh, 1
652	srli	yh, yh, 1
653
654	/* If y is zero, return zero.  */
655	or	a10, yh, yl
656	beqz	a10, .Lmul_return_zero
657
658	/* Normalize y.  Adjust the exponent in a9.  */
659	beqz	yh, .Lmul_yh_zero
660	do_nsau	a10, yh, a11, a12
661	addi	a10, a10, -11
662	ssl	a10
663	src	yh, yh, yl
664	sll	yl, yl
665	movi	a9, 1
666	sub	a9, a9, a10
667	j	.Lmul_ynormalized
668.Lmul_yh_zero:
669	do_nsau	a10, yl, a11, a12
670	addi	a10, a10, -11
671	movi	a9, -31
672	sub	a9, a9, a10
673	ssl	a10
674	bltz	a10, .Lmul_yl_srl
675	sll	yh, yl
676	movi	yl, 0
677	j	.Lmul_ynormalized
678.Lmul_yl_srl:
679	srl	yh, yl
680	sll	yl, yl
681	j	.Lmul_ynormalized
682
683.Lmul_return_zero:
684	/* Return zero with the appropriate sign bit.  */
685	srli	xh, a7, 31
686	slli	xh, xh, 31
687	movi	xl, 0
688	j	.Lmul_done
689
690.Lmul_xnan_or_inf:
691	/* If y is zero, return NaN.  */
692	bnez	yl, 1f
693	slli	a8, yh, 1
694	bnez	a8, 1f
695	movi	a4, 0x80000	/* make it a quiet NaN */
696	or	xh, xh, a4
697	j	.Lmul_done
6981:
699	/* If y is NaN, return y.  */
700	bnall	yh, a6, .Lmul_returnx
701	slli	a8, yh, 12
702	or	a8, a8, yl
703	beqz	a8, .Lmul_returnx
704
705.Lmul_returny:
706	mov	xh, yh
707	mov	xl, yl
708
709.Lmul_returnx:
710	/* Set the sign bit and return.  */
711	extui	a7, a7, 31, 1
712	slli	xh, xh, 1
713	ssai	1
714	src	xh, a7, xh
715	j	.Lmul_done
716
717.Lmul_ynan_or_inf:
718	/* If x is zero, return NaN.  */
719	bnez	xl, .Lmul_returny
720	slli	a8, xh, 1
721	bnez	a8, .Lmul_returny
722	movi	a7, 0x80000	/* make it a quiet NaN */
723	or	xh, yh, a7
724	j	.Lmul_done
725
726	.align	4
727	.global	__muldf3
728	.type	__muldf3, @function
729__muldf3:
730#if __XTENSA_CALL0_ABI__
731	leaf_entry sp, 32
732	addi	sp, sp, -32
733	s32i	a12, sp, 16
734	s32i	a13, sp, 20
735	s32i	a14, sp, 24
736	s32i	a15, sp, 28
737#elif XCHAL_NO_MUL
738	/* This is not really a leaf function; allocate enough stack space
739	   to allow CALL12s to a helper function.  */
740	leaf_entry sp, 64
741#else
742	leaf_entry sp, 32
743#endif
744	movi	a6, 0x7ff00000
745
746	/* Get the sign of the result.  */
747	xor	a7, xh, yh
748
749	/* Check for NaN and infinity.  */
750	ball	xh, a6, .Lmul_xnan_or_inf
751	ball	yh, a6, .Lmul_ynan_or_inf
752
753	/* Extract the exponents.  */
754	extui	a8, xh, 20, 11
755	extui	a9, yh, 20, 11
756
757	beqz	a8, .Lmul_xexpzero
758.Lmul_xnormalized:
759	beqz	a9, .Lmul_yexpzero
760.Lmul_ynormalized:
761
762	/* Add the exponents.  */
763	add	a8, a8, a9
764
765	/* Replace sign/exponent fields with explicit "1.0".  */
766	movi	a10, 0x1fffff
767	or	xh, xh, a6
768	and	xh, xh, a10
769	or	yh, yh, a6
770	and	yh, yh, a10
771
772	/* Multiply 64x64 to 128 bits.  The result ends up in xh/xl/a6.
773	   The least-significant word of the result is thrown away except
774	   that if it is nonzero, the lsb of a6 is set to 1.  */
775#if XCHAL_HAVE_MUL32_HIGH
776
777	/* Compute a6 with any carry-outs in a10.  */
778	movi	a10, 0
779	mull	a6, xl, yh
780	mull	a11, xh, yl
781	add	a6, a6, a11
782	bgeu	a6, a11, 1f
783	addi	a10, a10, 1
7841:
785	muluh	a11, xl, yl
786	add	a6, a6, a11
787	bgeu	a6, a11, 1f
788	addi	a10, a10, 1
7891:
790	/* If the low word of the result is nonzero, set the lsb of a6.  */
791	mull	a11, xl, yl
792	beqz	a11, 1f
793	movi	a9, 1
794	or	a6, a6, a9
7951:
796	/* Compute xl with any carry-outs in a9.  */
797	movi	a9, 0
798	mull	a11, xh, yh
799	add	a10, a10, a11
800	bgeu	a10, a11, 1f
801	addi	a9, a9, 1
8021:
803	muluh	a11, xh, yl
804	add	a10, a10, a11
805	bgeu	a10, a11, 1f
806	addi	a9, a9, 1
8071:
808	muluh	xl, xl, yh
809	add	xl, xl, a10
810	bgeu	xl, a10, 1f
811	addi	a9, a9, 1
8121:
813	/* Compute xh.  */
814	muluh	xh, xh, yh
815	add	xh, xh, a9
816
817#else /* ! XCHAL_HAVE_MUL32_HIGH */
818
819	/* Break the inputs into 16-bit chunks and compute 16 32-bit partial
820	   products.  These partial products are:
821
822		0 xll * yll
823
824		1 xll * ylh
825		2 xlh * yll
826
827		3 xll * yhl
828		4 xlh * ylh
829		5 xhl * yll
830
831		6 xll * yhh
832		7 xlh * yhl
833		8 xhl * ylh
834		9 xhh * yll
835
836		10 xlh * yhh
837		11 xhl * yhl
838		12 xhh * ylh
839
840		13 xhl * yhh
841		14 xhh * yhl
842
843		15 xhh * yhh
844
845	   where the input chunks are (hh, hl, lh, ll).  If using the Mul16
846	   or Mul32 multiplier options, these input chunks must be stored in
847	   separate registers.  For Mac16, the UMUL.AA.* opcodes can specify
848	   that the inputs come from either half of the registers, so there
849	   is no need to shift them out ahead of time.  If there is no
850	   multiply hardware, the 16-bit chunks can be extracted when setting
851	   up the arguments to the separate multiply function.  */
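	/* The scheme is ordinary schoolbook multiplication on 16-bit digits.
	   The same idea expressed with 32-bit halves, for reference (a C
	   sketch; the code below uses 16-bit digits so that every partial
	   product fits the 32-bit multiplier that is actually available):

	       uint32_t xl_ = (uint32_t) x, xh_ = (uint32_t) (x >> 32);
	       uint32_t yl_ = (uint32_t) y, yh_ = (uint32_t) (y >> 32);
	       uint64_t ll = (uint64_t) xl_ * yl_;
	       uint64_t lh = (uint64_t) xl_ * yh_;
	       uint64_t hl = (uint64_t) xh_ * yl_;
	       uint64_t hh = (uint64_t) xh_ * yh_;
	       uint64_t mid = (ll >> 32) + (uint32_t) lh + (uint32_t) hl;
	       uint64_t lo128 = (mid << 32) | (uint32_t) ll;		// low 64 bits
	       uint64_t hi128 = hh + (lh >> 32) + (hl >> 32) + (mid >> 32);

	   The assembly keeps only the top three 32-bit words of the product
	   and ORs any nonzero bits of the lowest word into a sticky bit.  */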
852
853	/* Save a7 since it is needed to hold a temporary value.  */
854	s32i	a7, sp, 4
855#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL
856	/* Calling a separate multiply function will clobber a0 and requires
857	   use of a8 as a temporary, so save those values now.  (The function
858	   uses a custom ABI so nothing else needs to be saved.)  */
859	s32i	a0, sp, 0
860	s32i	a8, sp, 8
861#endif
862
863#if XCHAL_HAVE_MUL16 || XCHAL_HAVE_MUL32
864
865#define xlh a12
866#define ylh a13
867#define xhh a14
868#define yhh a15
869
870	/* Get the high halves of the inputs into registers.  */
871	srli	xlh, xl, 16
872	srli	ylh, yl, 16
873	srli	xhh, xh, 16
874	srli	yhh, yh, 16
875
876#define xll xl
877#define yll yl
878#define xhl xh
879#define yhl yh
880
881#if XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MUL16
882	/* Clear the high halves of the inputs.  This does not matter
883	   for MUL16 because the high bits are ignored.  */
884	extui	xl, xl, 0, 16
885	extui	xh, xh, 0, 16
886	extui	yl, yl, 0, 16
887	extui	yh, yh, 0, 16
888#endif
889#endif /* MUL16 || MUL32 */
890
891
892#if XCHAL_HAVE_MUL16
893
894#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
895	mul16u	dst, xreg ## xhalf, yreg ## yhalf
896
897#elif XCHAL_HAVE_MUL32
898
899#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
900	mull	dst, xreg ## xhalf, yreg ## yhalf
901
902#elif XCHAL_HAVE_MAC16
903
904/* The preprocessor insists on inserting a space when concatenating after
905   a period in the definition of do_mul below.  These macros are a workaround
906   using underscores instead of periods when doing the concatenation.  */
907#define umul_aa_ll umul.aa.ll
908#define umul_aa_lh umul.aa.lh
909#define umul_aa_hl umul.aa.hl
910#define umul_aa_hh umul.aa.hh
911
912#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
913	umul_aa_ ## xhalf ## yhalf	xreg, yreg; \
914	rsr	dst, ACCLO
915
916#else /* no multiply hardware */
917
918#define set_arg_l(dst, src) \
919	extui	dst, src, 0, 16
920#define set_arg_h(dst, src) \
921	srli	dst, src, 16
922
923#if __XTENSA_CALL0_ABI__
924#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
925	set_arg_ ## xhalf (a13, xreg); \
926	set_arg_ ## yhalf (a14, yreg); \
927	call0	.Lmul_mulsi3; \
928	mov	dst, a12
929#else
930#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
931	set_arg_ ## xhalf (a14, xreg); \
932	set_arg_ ## yhalf (a15, yreg); \
933	call12	.Lmul_mulsi3; \
934	mov	dst, a14
935#endif /* __XTENSA_CALL0_ABI__ */
936
937#endif /* no multiply hardware */
938
939	/* Add pp1 and pp2 into a10 with carry-out in a9.  */
940	do_mul(a10, xl, l, yl, h)	/* pp 1 */
941	do_mul(a11, xl, h, yl, l)	/* pp 2 */
942	movi	a9, 0
943	add	a10, a10, a11
944	bgeu	a10, a11, 1f
945	addi	a9, a9, 1
9461:
947	/* Initialize a6 with a9/a10 shifted into position.  Note that
948	   this value can be safely incremented without any carry-outs.  */
949	ssai	16
950	src	a6, a9, a10
951
952	/* Compute the low word into a10.  */
953	do_mul(a11, xl, l, yl, l)	/* pp 0 */
954	sll	a10, a10
955	add	a10, a10, a11
956	bgeu	a10, a11, 1f
957	addi	a6, a6, 1
9581:
959	/* Compute the contributions of pp0-5 to a6, with carry-outs in a9.
960	   This is good enough to determine the low half of a6, so that any
961	   nonzero bits from the low word of the result can be collapsed
962	   into a6, freeing up a register.  */
963	movi	a9, 0
964	do_mul(a11, xl, l, yh, l)	/* pp 3 */
965	add	a6, a6, a11
966	bgeu	a6, a11, 1f
967	addi	a9, a9, 1
9681:
969	do_mul(a11, xl, h, yl, h)	/* pp 4 */
970	add	a6, a6, a11
971	bgeu	a6, a11, 1f
972	addi	a9, a9, 1
9731:
974	do_mul(a11, xh, l, yl, l)	/* pp 5 */
975	add	a6, a6, a11
976	bgeu	a6, a11, 1f
977	addi	a9, a9, 1
9781:
979	/* Collapse any nonzero bits from the low word into a6.  */
980	beqz	a10, 1f
981	movi	a11, 1
982	or	a6, a6, a11
9831:
984	/* Add pp6-9 into a11 with carry-outs in a10.  */
985	do_mul(a7, xl, l, yh, h)	/* pp 6 */
986	do_mul(a11, xh, h, yl, l)	/* pp 9 */
987	movi	a10, 0
988	add	a11, a11, a7
989	bgeu	a11, a7, 1f
990	addi	a10, a10, 1
9911:
992	do_mul(a7, xl, h, yh, l)	/* pp 7 */
993	add	a11, a11, a7
994	bgeu	a11, a7, 1f
995	addi	a10, a10, 1
9961:
997	do_mul(a7, xh, l, yl, h)	/* pp 8 */
998	add	a11, a11, a7
999	bgeu	a11, a7, 1f
1000	addi	a10, a10, 1
10011:
1002	/* Shift a10/a11 into position, and add low half of a11 to a6.  */
1003	src	a10, a10, a11
1004	add	a10, a10, a9
1005	sll	a11, a11
1006	add	a6, a6, a11
1007	bgeu	a6, a11, 1f
1008	addi	a10, a10, 1
10091:
1010	/* Add pp10-12 into xl with carry-outs in a9.  */
1011	movi	a9, 0
1012	do_mul(xl, xl, h, yh, h)	/* pp 10 */
1013	add	xl, xl, a10
1014	bgeu	xl, a10, 1f
1015	addi	a9, a9, 1
10161:
1017	do_mul(a10, xh, l, yh, l)	/* pp 11 */
1018	add	xl, xl, a10
1019	bgeu	xl, a10, 1f
1020	addi	a9, a9, 1
10211:
1022	do_mul(a10, xh, h, yl, h)	/* pp 12 */
1023	add	xl, xl, a10
1024	bgeu	xl, a10, 1f
1025	addi	a9, a9, 1
10261:
1027	/* Add pp13-14 into a11 with carry-outs in a10.  */
1028	do_mul(a11, xh, l, yh, h)	/* pp 13 */
1029	do_mul(a7, xh, h, yh, l)	/* pp 14 */
1030	movi	a10, 0
1031	add	a11, a11, a7
1032	bgeu	a11, a7, 1f
1033	addi	a10, a10, 1
10341:
1035	/* Shift a10/a11 into position, and add low half of a11 to a6.  */
1036	src	a10, a10, a11
1037	add	a10, a10, a9
1038	sll	a11, a11
1039	add	xl, xl, a11
1040	bgeu	xl, a11, 1f
1041	addi	a10, a10, 1
10421:
1043	/* Compute xh.  */
1044	do_mul(xh, xh, h, yh, h)	/* pp 15 */
1045	add	xh, xh, a10
1046
1047	/* Restore values saved on the stack during the multiplication.  */
1048	l32i	a7, sp, 4
1049#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL
1050	l32i	a0, sp, 0
1051	l32i	a8, sp, 8
1052#endif
1053#endif /* ! XCHAL_HAVE_MUL32_HIGH */
1054
1055	/* Shift left by 12 bits, unless there was a carry-out from the
1056	   multiply, in which case, shift by 11 bits and increment the
1057	   exponent.  Note: It is convenient to use the constant 0x3ff
1058	   instead of 0x400 when removing the extra exponent bias (so that
1059	   it is easy to construct 0x7fe for the overflow check).  Reverse
1060	   the logic here to decrement the exponent sum by one unless there
1061	   was a carry-out.  */
1062	movi	a4, 11
1063	srli	a5, xh, 21 - 12
1064	bnez	a5, 1f
1065	addi	a4, a4, 1
1066	addi	a8, a8, -1
10671:	ssl	a4
1068	src	xh, xh, xl
1069	src	xl, xl, a6
1070	sll	a6, a6
1071
1072	/* Subtract the extra bias from the exponent sum (plus one to account
1073	   for the explicit "1.0" of the mantissa that will be added to the
1074	   exponent in the final result).  */
1075	movi	a4, 0x3ff
1076	sub	a8, a8, a4
1077
1078	/* Check for over/underflow.  The value in a8 is one less than the
1079	   final exponent, so values in the range 0..7fd are OK here.  */
1080	slli	a4, a4, 1	/* 0x7fe */
1081	bgeu	a8, a4, .Lmul_overflow
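	/* In C terms the exponent handling is (a sketch):

	       int e = exp_x + exp_y - 0x3ff;	// a8: one less than the final exponent
	       if ((unsigned) e >= 0x7fe)	// one unsigned compare catches both
	           goto over_or_underflow;	//   overflow (too big) and underflow (e < 0)

	   Keeping a8 one low is deliberate: the explicit "1.0" of the
	   mantissa lands in the exponent field when the two are added and
	   supplies the missing +1.  */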
1082
1083.Lmul_round:
1084	/* Round.  */
1085	bgez	a6, .Lmul_rounded
1086	addi	xl, xl, 1
1087	beqz	xl, .Lmul_roundcarry
1088	slli	a6, a6, 1
1089	beqz	a6, .Lmul_exactlyhalf
1090
1091.Lmul_rounded:
1092	/* Add the exponent to the mantissa.  */
1093	slli	a8, a8, 20
1094	add	xh, xh, a8
1095
1096.Lmul_addsign:
1097	/* Add the sign bit.  */
1098	srli	a7, a7, 31
1099	slli	a7, a7, 31
1100	or	xh, xh, a7
1101
1102.Lmul_done:
1103#if __XTENSA_CALL0_ABI__
1104	l32i	a12, sp, 16
1105	l32i	a13, sp, 20
1106	l32i	a14, sp, 24
1107	l32i	a15, sp, 28
1108	addi	sp, sp, 32
1109#endif
1110	leaf_return
1111
1112.Lmul_exactlyhalf:
1113	/* Round down to the nearest even value.  */
1114	srli	xl, xl, 1
1115	slli	xl, xl, 1
1116	j	.Lmul_rounded
1117
1118.Lmul_roundcarry:
1119	/* xl is always zero when the rounding increment overflows, so
1120	   there's no need to round it to an even value.  */
1121	addi	xh, xh, 1
1122	/* Overflow is OK -- it will be added to the exponent.  */
1123	j	.Lmul_rounded
1124
1125.Lmul_overflow:
1126	bltz	a8, .Lmul_underflow
1127	/* Return +/- Infinity.  */
1128	addi	a8, a4, 1	/* 0x7ff */
1129	slli	xh, a8, 20
1130	movi	xl, 0
1131	j	.Lmul_addsign
1132
1133.Lmul_underflow:
1134	/* Create a subnormal value, where the exponent field contains zero,
1135	   but the effective exponent is 1.  The value of a8 is one less than
1136	   the actual exponent, so just negate it to get the shift amount.  */
1137	neg	a8, a8
1138	mov	a9, a6
1139	ssr	a8
1140	bgeui	a8, 32, .Lmul_bigshift
1141
1142	/* Shift xh/xl right.  Any bits that are shifted out of xl are saved
1143	   in a6 (combined with the shifted-out bits currently in a6) for
1144	   rounding the result.  */
1145	sll	a6, xl
1146	src	xl, xh, xl
1147	srl	xh, xh
1148	j	1f
1149
1150.Lmul_bigshift:
1151	bgeui	a8, 64, .Lmul_flush_to_zero
1152	sll	a10, xl		/* lost bits shifted out of xl */
1153	src	a6, xh, xl
1154	srl	xl, xh
1155	movi	xh, 0
1156	or	a9, a9, a10
1157
1158	/* Set the exponent to zero.  */
11591:	movi	a8, 0
1160
1161	/* Pack any nonzero bits shifted out into a6.  */
1162	beqz	a9, .Lmul_round
1163	movi	a9, 1
1164	or	a6, a6, a9
1165	j	.Lmul_round
1166
1167.Lmul_flush_to_zero:
1168	/* Return zero with the appropriate sign bit.  */
1169	srli	xh, a7, 31
1170	slli	xh, xh, 31
1171	movi	xl, 0
1172	j	.Lmul_done
1173
1174#if XCHAL_NO_MUL
1175
1176	/* For Xtensa processors with no multiply hardware, this simplified
1177	   version of _mulsi3 is used for multiplying 16-bit chunks of
1178	   the floating-point mantissas.  When using CALL0, this function
1179	   uses a custom ABI: the inputs are passed in a13 and a14, the
1180	   result is returned in a12, and a8 and a15 are clobbered.  */
1181	.align	4
1182.Lmul_mulsi3:
1183	leaf_entry sp, 16
1184	.macro mul_mulsi3_body dst, src1, src2, tmp1, tmp2
1185	movi	\dst, 0
11861:	add	\tmp1, \src2, \dst
1187	extui	\tmp2, \src1, 0, 1
1188	movnez	\dst, \tmp1, \tmp2
1189
1190	do_addx2 \tmp1, \src2, \dst, \tmp1
1191	extui	\tmp2, \src1, 1, 1
1192	movnez	\dst, \tmp1, \tmp2
1193
1194	do_addx4 \tmp1, \src2, \dst, \tmp1
1195	extui	\tmp2, \src1, 2, 1
1196	movnez	\dst, \tmp1, \tmp2
1197
1198	do_addx8 \tmp1, \src2, \dst, \tmp1
1199	extui	\tmp2, \src1, 3, 1
1200	movnez	\dst, \tmp1, \tmp2
1201
1202	srli	\src1, \src1, 4
1203	slli	\src2, \src2, 4
1204	bnez	\src1, 1b
1205	.endm
1206#if __XTENSA_CALL0_ABI__
1207	mul_mulsi3_body a12, a13, a14, a15, a8
1208#else
1209	/* The result will be written into a2, so save that argument in a4.  */
1210	mov	a4, a2
1211	mul_mulsi3_body a2, a4, a3, a5, a6
1212#endif
1213	leaf_return
1214#endif /* XCHAL_NO_MUL */
1215#endif /* L_muldf3 */
1216
1217#ifdef L_divdf3
1218
1219	/* Division */
1220__divdf3_aux:
1221
1222	/* Handle unusual cases (zeros, subnormals, NaNs and Infinities).
1223	   (This code is placed before the start of the function just to
1224	   keep it in range of the limited branch displacements.)  */
1225
1226.Ldiv_yexpzero:
1227	/* Clear the sign bit of y.  */
1228	slli	yh, yh, 1
1229	srli	yh, yh, 1
1230
1231	/* Check for division by zero.  */
1232	or	a10, yh, yl
1233	beqz	a10, .Ldiv_yzero
1234
1235	/* Normalize y.  Adjust the exponent in a9.  */
1236	beqz	yh, .Ldiv_yh_zero
1237	do_nsau	a10, yh, a11, a9
1238	addi	a10, a10, -11
1239	ssl	a10
1240	src	yh, yh, yl
1241	sll	yl, yl
1242	movi	a9, 1
1243	sub	a9, a9, a10
1244	j	.Ldiv_ynormalized
1245.Ldiv_yh_zero:
1246	do_nsau	a10, yl, a11, a9
1247	addi	a10, a10, -11
1248	movi	a9, -31
1249	sub	a9, a9, a10
1250	ssl	a10
1251	bltz	a10, .Ldiv_yl_srl
1252	sll	yh, yl
1253	movi	yl, 0
1254	j	.Ldiv_ynormalized
1255.Ldiv_yl_srl:
1256	srl	yh, yl
1257	sll	yl, yl
1258	j	.Ldiv_ynormalized
1259
1260.Ldiv_yzero:
1261	/* y is zero.  Return NaN if x is also zero; otherwise, infinity.  */
1262	slli	xh, xh, 1
1263	srli	xh, xh, 1
1264	or	xl, xl, xh
1265	srli	xh, a7, 31
1266	slli	xh, xh, 31
1267	or	xh, xh, a6
1268	bnez	xl, 1f
1269	movi	a4, 0x80000	/* make it a quiet NaN */
1270	or	xh, xh, a4
12711:	movi	xl, 0
1272	leaf_return
1273
1274.Ldiv_xexpzero:
1275	/* Clear the sign bit of x.  */
1276	slli	xh, xh, 1
1277	srli	xh, xh, 1
1278
1279	/* If x is zero, return zero.  */
1280	or	a10, xh, xl
1281	beqz	a10, .Ldiv_return_zero
1282
1283	/* Normalize x.  Adjust the exponent in a8.  */
1284	beqz	xh, .Ldiv_xh_zero
1285	do_nsau	a10, xh, a11, a8
1286	addi	a10, a10, -11
1287	ssl	a10
1288	src	xh, xh, xl
1289	sll	xl, xl
1290	movi	a8, 1
1291	sub	a8, a8, a10
1292	j	.Ldiv_xnormalized
1293.Ldiv_xh_zero:
1294	do_nsau	a10, xl, a11, a8
1295	addi	a10, a10, -11
1296	movi	a8, -31
1297	sub	a8, a8, a10
1298	ssl	a10
1299	bltz	a10, .Ldiv_xl_srl
1300	sll	xh, xl
1301	movi	xl, 0
1302	j	.Ldiv_xnormalized
1303.Ldiv_xl_srl:
1304	srl	xh, xl
1305	sll	xl, xl
1306	j	.Ldiv_xnormalized
1307
1308.Ldiv_return_zero:
1309	/* Return zero with the appropriate sign bit.  */
1310	srli	xh, a7, 31
1311	slli	xh, xh, 31
1312	movi	xl, 0
1313	leaf_return
1314
1315.Ldiv_xnan_or_inf:
1316	/* Set the sign bit of the result.  */
1317	srli	a7, yh, 31
1318	slli	a7, a7, 31
1319	xor	xh, xh, a7
1320	/* If y is NaN or Inf, return NaN.  */
1321	bnall	yh, a6, 1f
1322	movi	a4, 0x80000	/* make it a quiet NaN */
1323	or	xh, xh, a4
13241:	leaf_return
1325
1326.Ldiv_ynan_or_inf:
1327	/* If y is Infinity, return zero.  */
1328	slli	a8, yh, 12
1329	or	a8, a8, yl
1330	beqz	a8, .Ldiv_return_zero
1331	/* y is NaN; return it.  */
1332	mov	xh, yh
1333	mov	xl, yl
1334	leaf_return
1335
1336.Ldiv_highequal1:
1337	bltu	xl, yl, 2f
1338	j	3f
1339
1340	.align	4
1341	.global	__divdf3
1342	.type	__divdf3, @function
1343__divdf3:
1344	leaf_entry sp, 16
1345	movi	a6, 0x7ff00000
1346
1347	/* Get the sign of the result.  */
1348	xor	a7, xh, yh
1349
1350	/* Check for NaN and infinity.  */
1351	ball	xh, a6, .Ldiv_xnan_or_inf
1352	ball	yh, a6, .Ldiv_ynan_or_inf
1353
1354	/* Extract the exponents.  */
1355	extui	a8, xh, 20, 11
1356	extui	a9, yh, 20, 11
1357
1358	beqz	a9, .Ldiv_yexpzero
1359.Ldiv_ynormalized:
1360	beqz	a8, .Ldiv_xexpzero
1361.Ldiv_xnormalized:
1362
1363	/* Subtract the exponents.  */
1364	sub	a8, a8, a9
1365
1366	/* Replace sign/exponent fields with explicit "1.0".  */
1367	movi	a10, 0x1fffff
1368	or	xh, xh, a6
1369	and	xh, xh, a10
1370	or	yh, yh, a6
1371	and	yh, yh, a10
1372
1373	/* Set SAR for left shift by one.  */
1374	ssai	(32 - 1)
1375
1376	/* The first digit of the mantissa division must be a one.
1377	   Shift x (and adjust the exponent) as needed to make this true.  */
1378	bltu	yh, xh, 3f
1379	beq	yh, xh, .Ldiv_highequal1
13802:	src	xh, xh, xl
1381	sll	xl, xl
1382	addi	a8, a8, -1
13833:
1384	/* Do the first subtraction and shift.  */
1385	sub	xh, xh, yh
1386	bgeu	xl, yl, 1f
1387	addi	xh, xh, -1
13881:	sub	xl, xl, yl
1389	src	xh, xh, xl
1390	sll	xl, xl
1391
1392	/* Put the quotient into a10/a11.  */
1393	movi	a10, 0
1394	movi	a11, 1
1395
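	/* The loop below develops the quotient one bit per iteration, like
	   long division by hand.  In rough C over 64-bit values (a sketch;
	   the real loop works on 32-bit register pairs, and "rem" enters the
	   loop already shifted left once by the code above):

	       uint64_t q = 1;			// leading quotient bit, known to be 1
	       for (int i = 0; i < 52; i++) {
	           q <<= 1;
	           if (rem >= den) {		// next quotient bit is 1
	               rem -= den;
	               q |= 1;
	           }
	           rem <<= 1;			// advance the running remainder
	       }
	       // q is the 53-bit quotient; rem (still shifted left once) is
	       // what the rounding code at .Ldiv_round examines.
	*/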
1396	/* Divide one bit at a time for 52 bits.  */
1397	movi	a9, 52
1398#if XCHAL_HAVE_LOOPS
1399	loop	a9, .Ldiv_loopend
1400#endif
1401.Ldiv_loop:
1402	/* Shift the quotient << 1.  */
1403	src	a10, a10, a11
1404	sll	a11, a11
1405
1406	/* Is this digit a 0 or 1?  */
1407	bltu	xh, yh, 3f
1408	beq	xh, yh, .Ldiv_highequal2
1409
1410	/* Output a 1 and subtract.  */
14112:	addi	a11, a11, 1
1412	sub	xh, xh, yh
1413	bgeu	xl, yl, 1f
1414	addi	xh, xh, -1
14151:	sub	xl, xl, yl
1416
1417	/* Shift the dividend << 1.  */
14183:	src	xh, xh, xl
1419	sll	xl, xl
1420
1421#if !XCHAL_HAVE_LOOPS
1422	addi	a9, a9, -1
1423	bnez	a9, .Ldiv_loop
1424#endif
1425.Ldiv_loopend:
1426
1427	/* Add the exponent bias (less one to account for the explicit "1.0"
1428	   of the mantissa that will be added to the exponent in the final
1429	   result).  */
1430	movi	a9, 0x3fe
1431	add	a8, a8, a9
1432
1433	/* Check for over/underflow.  The value in a8 is one less than the
1434	   final exponent, so values in the range 0..7fd are OK here.  */
1435	addmi	a9, a9, 0x400	/* 0x7fe */
1436	bgeu	a8, a9, .Ldiv_overflow
1437
1438.Ldiv_round:
1439	/* Round.  The remainder (<< 1) is in xh/xl.  */
1440	bltu	xh, yh, .Ldiv_rounded
1441	beq	xh, yh, .Ldiv_highequal3
1442.Ldiv_roundup:
1443	addi	a11, a11, 1
1444	beqz	a11, .Ldiv_roundcarry
1445
1446.Ldiv_rounded:
1447	mov	xl, a11
1448	/* Add the exponent to the mantissa.  */
1449	slli	a8, a8, 20
1450	add	xh, a10, a8
1451
1452.Ldiv_addsign:
1453	/* Add the sign bit.  */
1454	srli	a7, a7, 31
1455	slli	a7, a7, 31
1456	or	xh, xh, a7
1457	leaf_return
1458
1459.Ldiv_highequal2:
1460	bgeu	xl, yl, 2b
1461	j	3b
1462
1463.Ldiv_highequal3:
1464	bltu	xl, yl, .Ldiv_rounded
1465	bne	xl, yl, .Ldiv_roundup
1466
1467	/* Remainder is exactly half the divisor.  Round even.  */
1468	addi	a11, a11, 1
1469	beqz	a11, .Ldiv_roundcarry
1470	srli	a11, a11, 1
1471	slli	a11, a11, 1
1472	j	.Ldiv_rounded
1473
1474.Ldiv_overflow:
1475	bltz	a8, .Ldiv_underflow
1476	/* Return +/- Infinity.  */
1477	addi	a8, a9, 1	/* 0x7ff */
1478	slli	xh, a8, 20
1479	movi	xl, 0
1480	j	.Ldiv_addsign
1481
1482.Ldiv_underflow:
1483	/* Create a subnormal value, where the exponent field contains zero,
1484	   but the effective exponent is 1.  The value of a8 is one less than
1485	   the actual exponent, so just negate it to get the shift amount.  */
1486	neg	a8, a8
1487	ssr	a8
1488	bgeui	a8, 32, .Ldiv_bigshift
1489
1490	/* Shift a10/a11 right.  Any bits that are shifted out of a11 are
1491	   saved in a6 for rounding the result.  */
1492	sll	a6, a11
1493	src	a11, a10, a11
1494	srl	a10, a10
1495	j	1f
1496
1497.Ldiv_bigshift:
1498	bgeui	a8, 64, .Ldiv_flush_to_zero
1499	sll	a9, a11		/* lost bits shifted out of a11 */
1500	src	a6, a10, a11
1501	srl	a11, a10
1502	movi	a10, 0
1503	or	xl, xl, a9
1504
1505	/* Set the exponent to zero.  */
15061:	movi	a8, 0
1507
1508	/* Pack any nonzero remainder (in xh/xl) into a6.  */
1509	or	xh, xh, xl
1510	beqz	xh, 1f
1511	movi	a9, 1
1512	or	a6, a6, a9
1513
1514	/* Round a10/a11 based on the bits shifted out into a6.  */
15151:	bgez	a6, .Ldiv_rounded
1516	addi	a11, a11, 1
1517	beqz	a11, .Ldiv_roundcarry
1518	slli	a6, a6, 1
1519	bnez	a6, .Ldiv_rounded
1520	srli	a11, a11, 1
1521	slli	a11, a11, 1
1522	j	.Ldiv_rounded
1523
1524.Ldiv_roundcarry:
1525	/* a11 is always zero when the rounding increment overflows, so
1526	   there's no need to round it to an even value.  */
1527	addi	a10, a10, 1
1528	/* Overflow to the exponent field is OK.  */
1529	j	.Ldiv_rounded
1530
1531.Ldiv_flush_to_zero:
1532	/* Return zero with the appropriate sign bit.  */
1533	srli	xh, a7, 31
1534	slli	xh, xh, 31
1535	movi	xl, 0
1536	leaf_return
1537
1538#endif /* L_divdf3 */
1539
1540#ifdef L_cmpdf2
1541
1542	/* Equal and Not Equal */
1543
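	/* These routines follow the usual libgcc soft-float conventions: the
	   compiler compares their integer result against zero.  Roughly, a
	   call site behaves like this C sketch (not code from this file):

	       if (__eqdf2 (a, b) == 0)   ...	// a == b
	       if (__ledf2 (a, b) <= 0)   ...	// a <= b
	       if (__gtdf2 (a, b) >  0)   ...	// a >  b
	       if (__unorddf2 (a, b))     ...	// isnan (a) || isnan (b)

	   When either operand is a NaN, each routine returns a value that
	   makes its predicate false (__gtdf2 returns 0, __gedf2 returns -1,
	   and so on), so every ordered comparison involving a NaN is false.  */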
1544	.align	4
1545	.global	__eqdf2
1546	.global	__nedf2
1547	.set	__nedf2, __eqdf2
1548	.type	__eqdf2, @function
1549__eqdf2:
1550	leaf_entry sp, 16
1551	bne	xl, yl, 2f
1552	bne	xh, yh, 4f
1553
1554	/* The values are equal but NaN != NaN.  Check the exponent.  */
1555	movi	a6, 0x7ff00000
1556	ball	xh, a6, 3f
1557
1558	/* Equal.  */
1559	movi	a2, 0
1560	leaf_return
1561
1562	/* Not equal.  */
15632:	movi	a2, 1
1564	leaf_return
1565
1566	/* Check if the mantissas are nonzero.  */
15673:	slli	a7, xh, 12
1568	or	a7, a7, xl
1569	j	5f
1570
1571	/* Check if x and y are zero with different signs.  */
15724:	or	a7, xh, yh
1573	slli	a7, a7, 1
1574	or	a7, a7, xl	/* xl == yl here */
1575
1576	/* Equal if a7 == 0, where a7 is either abs(x | y) or the mantissa
1577	   of x when exponent(x) = 0x7ff and x == y.  */
15785:	movi	a2, 0
1579	movi	a3, 1
1580	movnez	a2, a3, a7
1581	leaf_return
1582
1583
1584	/* Greater Than */
1585
1586	.align	4
1587	.global	__gtdf2
1588	.type	__gtdf2, @function
1589__gtdf2:
1590	leaf_entry sp, 16
1591	movi	a6, 0x7ff00000
1592	ball	xh, a6, 2f
15931:	bnall	yh, a6, .Lle_cmp
1594
1595	/* Check if y is a NaN.  */
1596	slli	a7, yh, 12
1597	or	a7, a7, yl
1598	beqz	a7, .Lle_cmp
1599	movi	a2, 0
1600	leaf_return
1601
1602	/* Check if x is a NaN.  */
16032:	slli	a7, xh, 12
1604	or	a7, a7, xl
1605	beqz	a7, 1b
1606	movi	a2, 0
1607	leaf_return
1608
1609
1610	/* Less Than or Equal */
1611
1612	.align	4
1613	.global	__ledf2
1614	.type	__ledf2, @function
1615__ledf2:
1616	leaf_entry sp, 16
1617	movi	a6, 0x7ff00000
1618	ball	xh, a6, 2f
16191:	bnall	yh, a6, .Lle_cmp
1620
1621	/* Check if y is a NaN.  */
1622	slli	a7, yh, 12
1623	or	a7, a7, yl
1624	beqz	a7, .Lle_cmp
1625	movi	a2, 1
1626	leaf_return
1627
1628	/* Check if x is a NaN.  */
16292:	slli	a7, xh, 12
1630	or	a7, a7, xl
1631	beqz	a7, 1b
1632	movi	a2, 1
1633	leaf_return
1634
1635.Lle_cmp:
1636	/* Check if x and y have different signs.  */
1637	xor	a7, xh, yh
1638	bltz	a7, .Lle_diff_signs
1639
1640	/* Check if x is negative.  */
1641	bltz	xh, .Lle_xneg
1642
1643	/* Check if x <= y.  */
1644	bltu	xh, yh, 4f
1645	bne	xh, yh, 5f
1646	bltu	yl, xl, 5f
16474:	movi	a2, 0
1648	leaf_return
1649
1650.Lle_xneg:
1651	/* Check if y <= x.  */
1652	bltu	yh, xh, 4b
1653	bne	yh, xh, 5f
1654	bgeu	xl, yl, 4b
16555:	movi	a2, 1
1656	leaf_return
1657
1658.Lle_diff_signs:
1659	bltz	xh, 4b
1660
1661	/* Check if both x and y are zero.  */
1662	or	a7, xh, yh
1663	slli	a7, a7, 1
1664	or	a7, a7, xl
1665	or	a7, a7, yl
1666	movi	a2, 1
1667	movi	a3, 0
1668	moveqz	a2, a3, a7
1669	leaf_return
1670
1671
1672	/* Greater Than or Equal */
1673
1674	.align	4
1675	.global	__gedf2
1676	.type	__gedf2, @function
1677__gedf2:
1678	leaf_entry sp, 16
1679	movi	a6, 0x7ff00000
1680	ball	xh, a6, 2f
16811:	bnall	yh, a6, .Llt_cmp
1682
1683	/* Check if y is a NaN.  */
1684	slli	a7, yh, 12
1685	or	a7, a7, yl
1686	beqz	a7, .Llt_cmp
1687	movi	a2, -1
1688	leaf_return
1689
1690	/* Check if x is a NaN.  */
16912:	slli	a7, xh, 12
1692	or	a7, a7, xl
1693	beqz	a7, 1b
1694	movi	a2, -1
1695	leaf_return
1696
1697
1698	/* Less Than */
1699
1700	.align	4
1701	.global	__ltdf2
1702	.type	__ltdf2, @function
1703__ltdf2:
1704	leaf_entry sp, 16
1705	movi	a6, 0x7ff00000
1706	ball	xh, a6, 2f
17071:	bnall	yh, a6, .Llt_cmp
1708
1709	/* Check if y is a NaN.  */
1710	slli	a7, yh, 12
1711	or	a7, a7, yl
1712	beqz	a7, .Llt_cmp
1713	movi	a2, 0
1714	leaf_return
1715
1716	/* Check if x is a NaN.  */
17172:	slli	a7, xh, 12
1718	or	a7, a7, xl
1719	beqz	a7, 1b
1720	movi	a2, 0
1721	leaf_return
1722
1723.Llt_cmp:
1724	/* Check if x and y have different signs.  */
1725	xor	a7, xh, yh
1726	bltz	a7, .Llt_diff_signs
1727
1728	/* Check if x is negative.  */
1729	bltz	xh, .Llt_xneg
1730
1731	/* Check if x < y.  */
1732	bltu	xh, yh, 4f
1733	bne	xh, yh, 5f
1734	bgeu	xl, yl, 5f
17354:	movi	a2, -1
1736	leaf_return
1737
1738.Llt_xneg:
1739	/* Check if y < x.  */
1740	bltu	yh, xh, 4b
1741	bne	yh, xh, 5f
1742	bltu	yl, xl, 4b
17435:	movi	a2, 0
1744	leaf_return
1745
1746.Llt_diff_signs:
1747	bgez	xh, 5b
1748
1749	/* x is negative and y is positive, so x < y unless both are zero.  */
1750	or	a7, xh, yh
1751	slli	a7, a7, 1
1752	or	a7, a7, xl
1753	or	a7, a7, yl
1754	movi	a2, 0
1755	movi	a3, -1
1756	movnez	a2, a3, a7
1757	leaf_return
1758
1759
1760	/* Unordered */
1761
1762	.align	4
1763	.global	__unorddf2
1764	.type	__unorddf2, @function
1765__unorddf2:
1766	leaf_entry sp, 16
1767	movi	a6, 0x7ff00000
1768	ball	xh, a6, 3f
17691:	ball	yh, a6, 4f
17702:	movi	a2, 0
1771	leaf_return
1772
17733:	slli	a7, xh, 12
1774	or	a7, a7, xl
1775	beqz	a7, 1b
1776	movi	a2, 1
1777	leaf_return
1778
17794:	slli	a7, yh, 12
1780	or	a7, a7, yl
1781	beqz	a7, 2b
1782	movi	a2, 1
1783	leaf_return
1784
1785#endif /* L_cmpdf2 */
1786
1787#ifdef L_fixdfsi
1788
1789	.align	4
1790	.global	__fixdfsi
1791	.type	__fixdfsi, @function
1792__fixdfsi:
1793	leaf_entry sp, 16
1794
1795	/* Check for NaN and Infinity.  */
1796	movi	a6, 0x7ff00000
1797	ball	xh, a6, .Lfixdfsi_nan_or_inf
1798
1799	/* Extract the exponent and check if 0 < (exp - 0x3fe) < 32.  */
1800	extui	a4, xh, 20, 11
1801	extui	a5, a6, 19, 10	/* 0x3fe */
1802	sub	a4, a4, a5
1803	bgei	a4, 32, .Lfixdfsi_maxint
1804	blti	a4, 1, .Lfixdfsi_zero
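	/* In C terms the conversion is (a sketch, truncating toward zero;
	   the saturation and zero cases branch away above):

	       int e = ((hi >> 20) & 0x7ff) - 0x3fe;	// 1..31 on this path
	       uint32_t m = (((hi & 0xfffff) | 0x100000) << 11) | (lo >> 21);
	       int32_t r = (int32_t) (m >> (32 - e));	// keep the integer part
	       return (hi & 0x80000000) ? -r : r;
	*/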
1805
1806	/* Add explicit "1.0" and shift << 11.  */
1807	or	a7, xh, a6
1808	ssai	(32 - 11)
1809	src	a5, a7, xl
1810
1811	/* Shift back to the right, based on the exponent.  */
1812	ssl	a4		/* shift by 32 - a4 */
1813	srl	a5, a5
1814
1815	/* Negate the result if sign != 0.  */
1816	neg	a2, a5
1817	movgez	a2, a5, a7
1818	leaf_return
1819
1820.Lfixdfsi_nan_or_inf:
1821	/* Handle Infinity and NaN.  */
1822	slli	a4, xh, 12
1823	or	a4, a4, xl
1824	beqz	a4, .Lfixdfsi_maxint
1825
1826	/* Translate NaN to +maxint.  */
1827	movi	xh, 0
1828
1829.Lfixdfsi_maxint:
1830	slli	a4, a6, 11	/* 0x80000000 */
1831	addi	a5, a4, -1	/* 0x7fffffff */
1832	movgez	a4, a5, xh
1833	mov	a2, a4
1834	leaf_return
1835
1836.Lfixdfsi_zero:
1837	movi	a2, 0
1838	leaf_return
1839
1840#endif /* L_fixdfsi */
1841
1842#ifdef L_fixdfdi
1843
1844	.align	4
1845	.global	__fixdfdi
1846	.type	__fixdfdi, @function
1847__fixdfdi:
1848	leaf_entry sp, 16
1849
1850	/* Check for NaN and Infinity.  */
1851	movi	a6, 0x7ff00000
1852	ball	xh, a6, .Lfixdfdi_nan_or_inf
1853
1854	/* Extract the exponent and check if 0 < (exp - 0x3fe) < 64.  */
1855	extui	a4, xh, 20, 11
1856	extui	a5, a6, 19, 10	/* 0x3fe */
1857	sub	a4, a4, a5
1858	bgei	a4, 64, .Lfixdfdi_maxint
1859	blti	a4, 1, .Lfixdfdi_zero
1860
1861	/* Add explicit "1.0" and shift << 11.  */
1862	or	a7, xh, a6
1863	ssai	(32 - 11)
1864	src	xh, a7, xl
1865	sll	xl, xl
1866
1867	/* Shift back to the right, based on the exponent.  */
1868	ssl	a4		/* shift by 64 - a4 */
1869	bgei	a4, 32, .Lfixdfdi_smallshift
1870	srl	xl, xh
1871	movi	xh, 0
1872
1873.Lfixdfdi_shifted:
1874	/* Negate the result if sign != 0.  */
1875	bgez	a7, 1f
1876	neg	xl, xl
1877	neg	xh, xh
1878	beqz	xl, 1f
1879	addi	xh, xh, -1
18801:	leaf_return
1881
1882.Lfixdfdi_smallshift:
1883	src	xl, xh, xl
1884	srl	xh, xh
1885	j	.Lfixdfdi_shifted
1886
1887.Lfixdfdi_nan_or_inf:
1888	/* Handle Infinity and NaN.  */
1889	slli	a4, xh, 12
1890	or	a4, a4, xl
1891	beqz	a4, .Lfixdfdi_maxint
1892
1893	/* Translate NaN to +maxint.  */
1894	movi	xh, 0
1895
1896.Lfixdfdi_maxint:
1897	slli	a7, a6, 11	/* 0x80000000 */
1898	bgez	xh, 1f
1899	mov	xh, a7
1900	movi	xl, 0
1901	leaf_return
1902
19031:	addi	xh, a7, -1	/* 0x7fffffff */
1904	movi	xl, -1
1905	leaf_return
1906
1907.Lfixdfdi_zero:
1908	movi	xh, 0
1909	movi	xl, 0
1910	leaf_return
1911
1912#endif /* L_fixdfdi */
1913
1914#ifdef L_fixunsdfsi
1915
1916	.align	4
1917	.global	__fixunsdfsi
1918	.type	__fixunsdfsi, @function
1919__fixunsdfsi:
1920	leaf_entry sp, 16
1921
1922	/* Check for NaN and Infinity.  */
1923	movi	a6, 0x7ff00000
1924	ball	xh, a6, .Lfixunsdfsi_nan_or_inf
1925
1926	/* Extract the exponent and check if 0 <= (exp - 0x3ff) < 32.  */
1927	extui	a4, xh, 20, 11
1928	extui	a5, a6, 20, 10	/* 0x3ff */
1929	sub	a4, a4, a5
1930	bgei	a4, 32, .Lfixunsdfsi_maxint
1931	bltz	a4, .Lfixunsdfsi_zero
1932
1933	/* Add explicit "1.0" and shift << 11.  */
1934	or	a7, xh, a6
1935	ssai	(32 - 11)
1936	src	a5, a7, xl
1937
1938	/* Shift back to the right, based on the exponent.  */
1939	addi	a4, a4, 1
1940	beqi	a4, 32, .Lfixunsdfsi_bigexp
1941	ssl	a4		/* shift by 32 - a4 */
1942	srl	a5, a5
1943
1944	/* Negate the result if sign != 0.  */
1945	neg	a2, a5
1946	movgez	a2, a5, a7
1947	leaf_return
1948
1949.Lfixunsdfsi_nan_or_inf:
1950	/* Handle Infinity and NaN.  */
1951	slli	a4, xh, 12
1952	or	a4, a4, xl
1953	beqz	a4, .Lfixunsdfsi_maxint
1954
1955	/* Translate NaN to 0xffffffff.  */
1956	movi	a2, -1
1957	leaf_return
1958
1959.Lfixunsdfsi_maxint:
1960	slli	a4, a6, 11	/* 0x80000000 */
1961	movi	a5, -1		/* 0xffffffff */
1962	movgez	a4, a5, xh
1963	mov	a2, a4
1964	leaf_return
1965
1966.Lfixunsdfsi_zero:
1967	movi	a2, 0
1968	leaf_return
1969
1970.Lfixunsdfsi_bigexp:
1971	/* Handle unsigned maximum exponent case.  */
1972	bltz	xh, 1f
1973	mov	a2, a5		/* no shift needed */
1974	leaf_return
1975
1976	/* Return 0x80000000 if negative.  */
19771:	slli	a2, a6, 11
1978	leaf_return
1979
1980#endif /* L_fixunsdfsi */
1981
1982#ifdef L_fixunsdfdi
1983
1984	.align	4
1985	.global	__fixunsdfdi
1986	.type	__fixunsdfdi, @function
1987__fixunsdfdi:
1988	leaf_entry sp, 16
1989
1990	/* Check for NaN and Infinity.  */
1991	movi	a6, 0x7ff00000
1992	ball	xh, a6, .Lfixunsdfdi_nan_or_inf
1993
1994	/* Extract the exponent and check if 0 <= (exp - 0x3ff) < 64.  */
1995	extui	a4, xh, 20, 11
1996	extui	a5, a6, 20, 10	/* 0x3ff */
1997	sub	a4, a4, a5
1998	bgei	a4, 64, .Lfixunsdfdi_maxint
1999	bltz	a4, .Lfixunsdfdi_zero
2000
2001	/* Add explicit "1.0" and shift << 11.  */
2002	or	a7, xh, a6
2003	ssai	(32 - 11)
2004	src	xh, a7, xl
2005	sll	xl, xl
2006
2007	/* Shift back to the right, based on the exponent.  */
2008	addi	a4, a4, 1
2009	beqi	a4, 64, .Lfixunsdfdi_bigexp
2010	ssl	a4		/* shift by 64 - a4 */
2011	bgei	a4, 32, .Lfixunsdfdi_smallshift
2012	srl	xl, xh
2013	movi	xh, 0
2014
2015.Lfixunsdfdi_shifted:
2016	/* Negate the result if sign != 0.  */
2017	bgez	a7, 1f
2018	neg	xl, xl
2019	neg	xh, xh
2020	beqz	xl, 1f
2021	addi	xh, xh, -1
20221:	leaf_return
2023
2024.Lfixunsdfdi_smallshift:
2025	src	xl, xh, xl
2026	srl	xh, xh
2027	j	.Lfixunsdfdi_shifted
2028
2029.Lfixunsdfdi_nan_or_inf:
2030	/* Handle Infinity and NaN.  */
2031	slli	a4, xh, 12
2032	or	a4, a4, xl
2033	beqz	a4, .Lfixunsdfdi_maxint
2034
2035	/* Translate NaN to 0xffffffff.... */
20361:	movi	xh, -1
2037	movi	xl, -1
2038	leaf_return
2039
2040.Lfixunsdfdi_maxint:
2041	bgez	xh, 1b
20422:	slli	xh, a6, 11	/* 0x80000000 */
2043	movi	xl, 0
2044	leaf_return
2045
2046.Lfixunsdfdi_zero:
2047	movi	xh, 0
2048	movi	xl, 0
2049	leaf_return
2050
2051.Lfixunsdfdi_bigexp:
2052	/* Handle unsigned maximum exponent case.  */
2053	bltz	a7, 2b
2054	leaf_return		/* no shift needed */
2055
2056#endif /* L_fixunsdfdi */
2057
2058#ifdef L_floatsidf
2059
2060	.align	4
2061	.global	__floatunsidf
2062	.type	__floatunsidf, @function
2063__floatunsidf:
2064	leaf_entry sp, 16
2065	beqz	a2, .Lfloatsidf_return_zero
2066
2067	/* Set the sign to zero and jump to the floatsidf code.  */
2068	movi	a7, 0
2069	j	.Lfloatsidf_normalize
2070
2071	.align	4
2072	.global	__floatsidf
2073	.type	__floatsidf, @function
2074__floatsidf:
2075	leaf_entry sp, 16
2076
2077	/* Check for zero.  */
2078	beqz	a2, .Lfloatsidf_return_zero
2079
2080	/* Save the sign.  */
2081	extui	a7, a2, 31, 1
2082
2083	/* Get the absolute value.  */
2084#if XCHAL_HAVE_ABS
2085	abs	a2, a2
2086#else
2087	neg	a4, a2
2088	movltz	a2, a4, a2
2089#endif
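	/* The whole conversion, in rough C (a sketch; clz32() is an assumed
	   count-leading-zeros helper, and the a == 0 case was handled
	   above).  No rounding is needed because every 32-bit integer is
	   exactly representable in double precision:

	       uint32_t sign = (uint32_t) a & 0x80000000;
	       uint32_t m = sign ? - (uint32_t) a : (uint32_t) a;
	       int lz = clz32 (m);		// m != 0 here
	       m <<= lz;			// leading 1 now at bit 31
	       hi = sign | ((uint32_t) (0x41e - lz) << 20) | ((m >> 11) & 0xfffff);
	       lo = m << 21;
	*/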
2090
2091.Lfloatsidf_normalize:
2092	/* Normalize with the first 1 bit in the msb.  */
2093	do_nsau	a4, a2, a5, a6
2094	ssl	a4
2095	sll	a5, a2
2096
2097	/* Shift the mantissa into position.  */
2098	srli	xh, a5, 11
2099	slli	xl, a5, (32 - 11)
2100
2101	/* Set the exponent.  */
2102	movi	a5, 0x41d	/* 0x3fe + 31 */
2103	sub	a5, a5, a4
2104	slli	a5, a5, 20
2105	add	xh, xh, a5
2106
2107	/* Add the sign and return. */
2108	slli	a7, a7, 31
2109	or	xh, xh, a7
2110	leaf_return
2111
2112.Lfloatsidf_return_zero:
2113	movi	a3, 0
2114	leaf_return
2115
2116#endif /* L_floatsidf */
2117
2118#ifdef L_floatdidf
2119
2120	.align	4
2121	.global	__floatundidf
2122	.type	__floatundidf, @function
2123__floatundidf:
2124	leaf_entry sp, 16
2125
2126	/* Check for zero.  */
2127	or	a4, xh, xl
2128	beqz	a4, 2f
2129
2130	/* Set the sign to zero and jump to the floatdidf code.  */
2131	movi	a7, 0
2132	j	.Lfloatdidf_normalize
2133
2134	.align	4
2135	.global	__floatdidf
2136	.type	__floatdidf, @function
2137__floatdidf:
2138	leaf_entry sp, 16
2139
2140	/* Check for zero.  */
2141	or	a4, xh, xl
2142	beqz	a4, 2f
2143
2144	/* Save the sign.  */
2145	extui	a7, xh, 31, 1
2146
2147	/* Get the absolute value.  */
2148	bgez	xh, .Lfloatdidf_normalize
2149	neg	xl, xl
2150	neg	xh, xh
2151	beqz	xl, .Lfloatdidf_normalize
2152	addi	xh, xh, -1
2153
2154.Lfloatdidf_normalize:
2155	/* Normalize with the first 1 bit in the msb of xh.  */
2156	beqz	xh, .Lfloatdidf_bigshift
2157	do_nsau	a4, xh, a5, a6
2158	ssl	a4
2159	src	xh, xh, xl
2160	sll	xl, xl
2161
2162.Lfloatdidf_shifted:
2163	/* Shift the mantissa into position, with rounding bits in a6.  */
2164	ssai	11
2165	sll	a6, xl
2166	src	xl, xh, xl
2167	srl	xh, xh
2168
2169	/* Set the exponent.  */
2170	movi	a5, 0x43d	/* 0x3fe + 63 */
2171	sub	a5, a5, a4
2172	slli	a5, a5, 20
2173	add	xh, xh, a5
2174
2175	/* Add the sign.  */
2176	slli	a7, a7, 31
2177	or	xh, xh, a7
2178
2179	/* Round up if the leftover fraction is >= 1/2.  */
2180	bgez	a6, 2f
2181	addi	xl, xl, 1
2182	beqz	xl, .Lfloatdidf_roundcarry
2183
2184	/* Check if the leftover fraction is exactly 1/2.  */
2185	slli	a6, a6, 1
2186	beqz	a6, .Lfloatdidf_exactlyhalf
21872:	leaf_return
2188
2189.Lfloatdidf_bigshift:
2190	/* xh is zero.  Normalize with first 1 bit of xl in the msb of xh.  */
2191	do_nsau	a4, xl, a5, a6
2192	ssl	a4
2193	sll	xh, xl
2194	movi	xl, 0
2195	addi	a4, a4, 32
2196	j	.Lfloatdidf_shifted
2197
2198.Lfloatdidf_exactlyhalf:
2199	/* Round down to the nearest even value.  */
2200	srli	xl, xl, 1
2201	slli	xl, xl, 1
2202	leaf_return
2203
2204.Lfloatdidf_roundcarry:
2205	/* xl is always zero when the rounding increment overflows, so
2206	   there's no need to round it to an even value.  */
2207	addi	xh, xh, 1
2208	/* Overflow to the exponent is OK.  */
2209	leaf_return
2210
2211#endif /* L_floatdidf */
2212
2213#ifdef L_truncdfsf2
2214
2215	.align	4
2216	.global	__truncdfsf2
2217	.type	__truncdfsf2, @function
2218__truncdfsf2:
2219	leaf_entry sp, 16
2220
2221	/* Adjust the exponent bias.  */
2222	movi	a4, (0x3ff - 0x7f) << 20
2223	sub	a5, xh, a4
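	/* Rebasing the exponent first turns the rest of the conversion into
	   a plain shift-and-round.  In rough C (a sketch of the normal path
	   only; underflow, overflow and NaN/Inf take the branches below):

	       uint64_t bits = ((uint64_t) hi << 32) | lo;
	       uint64_t rb = bits - ((uint64_t) (0x3ff - 0x7f) << 52);	// single-prec bias
	       uint32_t f = (uint32_t) (rb >> 29);	// exponent + top 23 mantissa bits
	       uint32_t rest = ((uint32_t) rb) << 3;	// 29 discarded bits, msb = guard
	       // add the sign bit, then round to nearest even based on "rest"
	*/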
2224
2225	/* Check for underflow.  */
2226	xor	a6, xh, a5
2227	bltz	a6, .Ltrunc_underflow
2228	extui	a6, a5, 20, 11
2229	beqz	a6, .Ltrunc_underflow
2230
2231	/* Check for overflow.  */
2232	movi	a4, 255
2233	bge	a6, a4, .Ltrunc_overflow
2234
2235	/* Shift a5/xl << 3 into a5/a4.  */
2236	ssai	(32 - 3)
2237	src	a5, a5, xl
2238	sll	a4, xl
2239
2240.Ltrunc_addsign:
2241	/* Add the sign bit.  */
2242	extui	a6, xh, 31, 1
2243	slli	a6, a6, 31
2244	or	a2, a6, a5
2245
2246	/* Round up if the leftover fraction is >= 1/2.  */
2247	bgez	a4, 1f
2248	addi	a2, a2, 1
2249	/* Overflow to the exponent is OK.  The answer will be correct.  */
2250
2251	/* Check if the leftover fraction is exactly 1/2.  */
2252	slli	a4, a4, 1
2253	beqz	a4, .Ltrunc_exactlyhalf
22541:	leaf_return
2255
2256.Ltrunc_exactlyhalf:
2257	/* Round down to the nearest even value.  */
2258	srli	a2, a2, 1
2259	slli	a2, a2, 1
2260	leaf_return
2261
2262.Ltrunc_overflow:
2263	/* Check if exponent == 0x7ff.  */
2264	movi	a4, 0x7ff00000
2265	bnall	xh, a4, 1f
2266
2267	/* Check if mantissa is nonzero.  */
2268	slli	a5, xh, 12
2269	or	a5, a5, xl
2270	beqz	a5, 1f
2271
2272	/* Shift a4 to set a bit in the mantissa, making a quiet NaN.  */
2273	srli	a4, a4, 1
2274
22751:	slli	a4, a4, 4	/* 0xff000000 or 0xff800000 */
2276	/* Add the sign bit.  */
2277	extui	a6, xh, 31, 1
2278	ssai	1
2279	src	a2, a6, a4
2280	leaf_return
2281
2282.Ltrunc_underflow:
2283	/* Find shift count for a subnormal.  Flush to zero if >= 32.  */
2284	extui	a6, xh, 20, 11
2285	movi	a5, 0x3ff - 0x7f
2286	sub	a6, a5, a6
2287	addi	a6, a6, 1
2288	bgeui	a6, 32, 1f
2289
2290	/* Replace the exponent with an explicit "1.0".  */
2291	slli	a5, a5, 13	/* 0x700000 */
2292	or	a5, a5, xh
2293	slli	a5, a5, 11
2294	srli	a5, a5, 11
2295
2296	/* Shift the mantissa left by 3 bits (into a5/a4).  */
2297	ssai	(32 - 3)
2298	src	a5, a5, xl
2299	sll	a4, xl
2300
2301	/* Shift right by a6.  */
2302	ssr	a6
2303	sll	a7, a4
2304	src	a4, a5, a4
2305	srl	a5, a5
2306	beqz	a7, .Ltrunc_addsign
2307	or	a4, a4, a6	/* any positive, nonzero value will work */
2308	j	.Ltrunc_addsign
2309
2310	/* Return +/- zero.  */
23111:	extui	a2, xh, 31, 1
2312	slli	a2, a2, 31
2313	leaf_return
2314
2315#endif /* L_truncdfsf2 */
2316
2317#ifdef L_extendsfdf2
2318
2319	.align	4
2320	.global	__extendsfdf2
2321	.type	__extendsfdf2, @function
2322__extendsfdf2:
2323	leaf_entry sp, 16
2324
2325	/* Save the sign bit and then shift it off.  */
2326	extui	a5, a2, 31, 1
2327	slli	a5, a5, 31
2328	slli	a4, a2, 1
2329
2330	/* Extract and check the exponent.  */
2331	extui	a6, a2, 23, 8
2332	beqz	a6, .Lextend_expzero
2333	addi	a6, a6, 1
2334	beqi	a6, 256, .Lextend_nan_or_inf
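	/* Widening is exact, so there is no rounding.  In rough C (a sketch
	   for normal numbers only; zero/subnormal and NaN/Inf take the
	   branches handled elsewhere in this function):

	       uint32_t s = f & 0x80000000;
	       uint32_t e = (f >> 23) & 0xff;
	       uint32_t m = f & 0x7fffff;
	       hi = s | ((e + (0x3ff - 0x7f)) << 20) | (m >> 3);
	       lo = m << 29;
	*/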
2335
2336	/* Shift >> 3 into a4/xl.  */
2337	srli	a4, a4, 4
2338	slli	xl, a2, (32 - 3)
2339
2340	/* Adjust the exponent bias.  */
2341	movi	a6, (0x3ff - 0x7f) << 20
2342	add	a4, a4, a6
2343
2344	/* Add the sign bit.  */
2345	or	xh, a4, a5
2346	leaf_return
2347
2348.Lextend_nan_or_inf:
2349	movi	a4, 0x7ff00000
2350
2351	/* Check for NaN.  */
2352	slli	a7, a2, 9
2353	beqz	a7, 1f
2354
2355	slli	a6, a6, 11	/* 0x80000 */
2356	or	a4, a4, a6
2357
2358	/* Add the sign and return.  */
23591:	or	xh, a4, a5
2360	movi	xl, 0
2361	leaf_return
2362
2363.Lextend_expzero:
2364	beqz	a4, 1b
2365
2366	/* Normalize it to have 8 zero bits before the first 1 bit.  */
2367	do_nsau	a7, a4, a2, a3
2368	addi	a7, a7, -8
2369	ssl	a7
2370	sll	a4, a4
2371
2372	/* Shift >> 3 into a4/xl.  */
2373	slli	xl, a4, (32 - 3)
2374	srli	a4, a4, 3
2375
2376	/* Set the exponent.  */
2377	movi	a6, 0x3fe - 0x7f
2378	sub	a6, a6, a7
2379	slli	a6, a6, 20
2380	add	a4, a4, a6
2381
2382	/* Add the sign and return.  */
2383	or	xh, a4, a5
2384	leaf_return
2385
2386#endif /* L_extendsfdf2 */
2387
2388
2389