1/* longlong.h -- definitions for mixed size 32/64 bit arithmetic.
2
3Copyright 1991, 1992, 1993, 1994, 1996, 1997, 1999, 2000, 2001, 2002, 2003,
42004, 2005, 2007, 2008, 2009, 2011 Free Software Foundation, Inc.
5
6This file is free software; you can redistribute it and/or modify it under the
7terms of the GNU Lesser General Public License as published by the Free
8Software Foundation; either version 3 of the License, or (at your option) any
9later version.
10
11This file is distributed in the hope that it will be useful, but WITHOUT ANY
12WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
13PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
14details.
15
16You should have received a copy of the GNU Lesser General Public License
17along with this file.  If not, see http://www.gnu.org/licenses/.  */
18
19/* You have to define the following before including this file:
20
21   UWtype -- An unsigned type, default type for operations (typically a "word")
22   UHWtype -- An unsigned type, at least half the size of UWtype
23   UDWtype -- An unsigned type, at least twice as large a UWtype
24   W_TYPE_SIZE -- size in bits of UWtype
25
26   SItype, USItype -- Signed and unsigned 32 bit types
27   DItype, UDItype -- Signed and unsigned 64 bit types
28
29   On a 32 bit machine UWtype should typically be USItype;
30   on a 64 bit machine, UWtype should typically be UDItype.
31
32   Optionally, define:
33
34   LONGLONG_STANDALONE -- Avoid code that needs machine-dependent support files
35   NO_ASM -- Disable inline asm
36
37
38   CAUTION!  Using this version of longlong.h outside of GMP is not safe.  You
39   need to include gmp.h and gmp-impl.h, or certain things might not work as
40   expected.
41*/
42
43#define __BITS4 (W_TYPE_SIZE / 4)
44#define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
45#define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
46#define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))
47
48/* This is used to make sure no undesirable sharing between different libraries
49   that use this file takes place.  */
50#ifndef __MPN
51#define __MPN(x) __##x
52#endif
53
54#ifndef _PROTO
55#if (__STDC__-0) || defined (__cplusplus)
56#define _PROTO(x) x
57#else
58#define _PROTO(x) ()
59#endif
60#endif
61
62/* Define auxiliary asm macros.
63
64   1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
65   UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
66   word product in HIGH_PROD and LOW_PROD.
67
68   2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
69   UDWtype product.  This is just a variant of umul_ppmm.
70
71   3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
72   denominator) divides a UDWtype, composed by the UWtype integers
73   HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
74   in QUOTIENT and the remainder in REMAINDER.  HIGH_NUMERATOR must be less
75   than DENOMINATOR for correct operation.  If, in addition, the most
76   significant bit of DENOMINATOR must be 1, then the pre-processor symbol
77   UDIV_NEEDS_NORMALIZATION is defined to 1.
78
79   4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
80   denominator).  Like udiv_qrnnd but the numbers are signed.  The quotient
81   is rounded towards 0.
82
83   5) count_leading_zeros(count, x) counts the number of zero-bits from the
84   msb to the first non-zero bit in the UWtype X.  This is the number of
85   steps X needs to be shifted left to set the msb.  Undefined for X == 0,
86   unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.
87
88   6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
89   from the least significant end.
90
91   7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
92   high_addend_2, low_addend_2) adds two UWtype integers, composed by
93   HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
94   respectively.  The result is placed in HIGH_SUM and LOW_SUM.  Overflow
95   (i.e. carry out) is not stored anywhere, and is lost.
96
97   8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
98   high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
99   composed by HIGH_MINUEND_1 and LOW_MINUEND_1, and HIGH_SUBTRAHEND_2 and
100   LOW_SUBTRAHEND_2 respectively.  The result is placed in HIGH_DIFFERENCE
101   and LOW_DIFFERENCE.  Overflow (i.e. carry out) is not stored anywhere,
102   and is lost.
103
104   If any of these macros are left undefined for a particular CPU,
105   C macros are used.
106
107
108   Notes:
109
110   For add_ssaaaa the two high and two low addends can both commute, but
111   unfortunately gcc only supports one "%" commutative in each asm block.
112   This has always been so but is only documented in recent versions
113   (eg. pre-release 3.3).  Having two or more "%"s can cause an internal
114   compiler error in certain rare circumstances.
115
116   Apparently it was only the last "%" that was ever actually respected, so
117   the code has been updated to leave just that.  Clearly there's a free
118   choice whether high or low should get it, if there's a reason to favour
119   one over the other.  Also obviously when the constraints on the two
120   operands are identical there's no benefit to the reloader in any "%" at
121   all.
122
123   */
124
125/* The CPUs come in alphabetical order below.
126
127   Please add support for more CPUs here, or improve the current support
128   for the CPUs below!  */
129
130
131/* count_leading_zeros_gcc_clz is count_leading_zeros implemented with gcc
132   3.4 __builtin_clzl or __builtin_clzll, according to our limb size.
133   Similarly count_trailing_zeros_gcc_ctz using __builtin_ctzl or
134   __builtin_ctzll.
135
136   These builtins are only used when we check what code comes out, on some
137   chips they're merely libgcc calls, where we will instead want an inline
138   in that case (either asm or generic C).
139
140   These builtins are better than an asm block of the same insn, since an
141   asm block doesn't give gcc any information about scheduling or resource
142   usage.  We keep an asm block for use on prior versions of gcc though.
143
144   For reference, __builtin_ffs existed in gcc prior to __builtin_clz, but
145   it's not used (for count_leading_zeros) because it generally gives extra
146   code to ensure the result is 0 when the input is 0, which we don't need
147   or want.  */
148
149#ifdef _LONG_LONG_LIMB
150#define count_leading_zeros_gcc_clz(count,x)    \
151  do {                                          \
152    ASSERT ((x) != 0);                          \
153    (count) = __builtin_clzll (x);              \
154  } while (0)
155#else
156#define count_leading_zeros_gcc_clz(count,x)    \
157  do {                                          \
158    ASSERT ((x) != 0);                          \
159    (count) = __builtin_clzl (x);               \
160  } while (0)
161#endif
162
163#ifdef _LONG_LONG_LIMB
164#define count_trailing_zeros_gcc_ctz(count,x)   \
165  do {                                          \
166    ASSERT ((x) != 0);                          \
167    (count) = __builtin_ctzll (x);              \
168  } while (0)
169#else
170#define count_trailing_zeros_gcc_ctz(count,x)   \
171  do {                                          \
172    ASSERT ((x) != 0);                          \
173    (count) = __builtin_ctzl (x);               \
174  } while (0)
175#endif
176
177
178/* FIXME: The macros using external routines like __MPN(count_leading_zeros)
179   don't need to be under !NO_ASM */
180#if ! defined (NO_ASM)
181
182#if defined (__alpha) && W_TYPE_SIZE == 64
183/* Most alpha-based machines, except Cray systems. */
184#if defined (__GNUC__)
185#if __GMP_GNUC_PREREQ (3,3)
186#define umul_ppmm(ph, pl, m0, m1) \
187  do {									\
188    UDItype __m0 = (m0), __m1 = (m1);					\
189    (ph) = __builtin_alpha_umulh (__m0, __m1);				\
190    (pl) = __m0 * __m1;							\
191  } while (0)
192#else
193#define umul_ppmm(ph, pl, m0, m1) \
194  do {									\
195    UDItype __m0 = (m0), __m1 = (m1);					\
196    __asm__ ("umulh %r1,%2,%0"						\
197	     : "=r" (ph)						\
198	     : "%rJ" (m0), "rI" (m1));					\
199    (pl) = __m0 * __m1;							\
200  } while (0)
201#endif
202#define UMUL_TIME 18
203#else /* ! __GNUC__ */
204#include <machine/builtins.h>
205#define umul_ppmm(ph, pl, m0, m1) \
206  do {									\
207    UDItype __m0 = (m0), __m1 = (m1);					\
208    (ph) = __UMULH (m0, m1);						\
209    (pl) = __m0 * __m1;							\
210  } while (0)
211#endif
212#ifndef LONGLONG_STANDALONE
213#define udiv_qrnnd(q, r, n1, n0, d) \
214  do { UWtype __di;							\
215    __di = __MPN(invert_limb) (d);					\
216    udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
217  } while (0)
218#define UDIV_PREINV_ALWAYS  1
219#define UDIV_NEEDS_NORMALIZATION 1
220#define UDIV_TIME 220
221#endif /* LONGLONG_STANDALONE */
222
223/* clz_tab is required in all configurations, since mpn/alpha/cntlz.asm
224   always goes into libgmp.so, even when not actually used.  */
225#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
226
227#if defined (__GNUC__) && HAVE_HOST_CPU_alpha_CIX
228#define count_leading_zeros(COUNT,X) \
229  __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X))
230#define count_trailing_zeros(COUNT,X) \
231  __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X))
232#endif /* clz/ctz using cix */
233
234#if ! defined (count_leading_zeros)                             \
235  && defined (__GNUC__) && ! defined (LONGLONG_STANDALONE)
236/* ALPHA_CMPBGE_0 gives "cmpbge $31,src,dst", ie. test src bytes == 0.
237   "$31" is written explicitly in the asm, since an "r" constraint won't
238   select reg 31.  There seems no need to worry about "r31" syntax for cray,
239   since gcc itself (pre-release 3.4) emits just $31 in various places.  */
240#define ALPHA_CMPBGE_0(dst, src)                                        \
241  do { asm ("cmpbge $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0)
242/* Zero bytes are turned into bits with cmpbge, a __clz_tab lookup counts
243   them, locating the highest non-zero byte.  A second __clz_tab lookup
244   counts the leading zero bits in that byte, giving the result.  */
245#define count_leading_zeros(count, x)                                   \
246  do {                                                                  \
247    UWtype  __clz__b, __clz__c, __clz__x = (x);                         \
248    ALPHA_CMPBGE_0 (__clz__b,  __clz__x);           /* zero bytes */    \
249    __clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F];  /* 8 to 1 byte */   \
250    __clz__b = __clz__b * 8 - 7;                    /* 57 to 1 shift */ \
251    __clz__x >>= __clz__b;                                              \
252    __clz__c = __clz_tab [__clz__x];                /* 8 to 1 bit */    \
253    __clz__b = 65 - __clz__b;                                           \
254    (count) = __clz__b - __clz__c;                                      \
255  } while (0)
256#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
257#endif /* clz using cmpbge */
258
259#if ! defined (count_leading_zeros) && ! defined (LONGLONG_STANDALONE)
260#if HAVE_ATTRIBUTE_CONST
261long __MPN(count_leading_zeros) _PROTO ((UDItype)) __attribute__ ((const));
262#else
263long __MPN(count_leading_zeros) _PROTO ((UDItype));
264#endif
265#define count_leading_zeros(count, x) \
266  ((count) = __MPN(count_leading_zeros) (x))
267#endif /* clz using mpn */
268#endif /* __alpha */
269
270#if defined (_CRAY) && W_TYPE_SIZE == 64
271#include <intrinsics.h>
272#define UDIV_PREINV_ALWAYS  1
273#define UDIV_NEEDS_NORMALIZATION 1
274#define UDIV_TIME 220
275long __MPN(count_leading_zeros) _PROTO ((UDItype));
276#define count_leading_zeros(count, x) \
277  ((count) = _leadz ((UWtype) (x)))
278#if defined (_CRAYIEEE)		/* I.e., Cray T90/ieee, T3D, and T3E */
279#define umul_ppmm(ph, pl, m0, m1) \
280  do {									\
281    UDItype __m0 = (m0), __m1 = (m1);					\
282    (ph) = _int_mult_upper (m0, m1);					\
283    (pl) = __m0 * __m1;							\
284  } while (0)
285#ifndef LONGLONG_STANDALONE
286#define udiv_qrnnd(q, r, n1, n0, d) \
287  do { UWtype __di;							\
288    __di = __MPN(invert_limb) (d);					\
289    udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
290  } while (0)
291#endif /* LONGLONG_STANDALONE */
292#endif /* _CRAYIEEE */
293#endif /* _CRAY */
294
295#if defined (__ia64) && W_TYPE_SIZE == 64
296/* This form encourages gcc (pre-release 3.4 at least) to emit predicated
297   "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency.  The generic
298   code using "al<bl" arithmetically comes out making an actual 0 or 1 in a
299   register, which takes an extra cycle.  */
300#define sub_ddmmss(sh, sl, ah, al, bh, bl)      \
301  do {                                          \
302    UWtype __x;                                 \
303    __x = (al) - (bl);                          \
304    if ((al) < (bl))                            \
305      (sh) = (ah) - (bh) - 1;                   \
306    else                                        \
307      (sh) = (ah) - (bh);                       \
308    (sl) = __x;                                 \
309  } while (0)
310#if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
311/* Do both product parts in assembly, since that gives better code with
312   all gcc versions.  Some callers will just use the upper part, and in
313   that situation we waste an instruction, but not any cycles.  */
314#define umul_ppmm(ph, pl, m0, m1) \
315    __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0"		\
316	     : "=&f" (ph), "=f" (pl)					\
317	     : "f" (m0), "f" (m1))
318#define UMUL_TIME 14
319#define count_leading_zeros(count, x) \
320  do {									\
321    UWtype _x = (x), _y, _a, _c;					\
322    __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x));		\
323    __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y));		\
324    _c = (_a - 1) << 3;							\
325    _x >>= _c;								\
326    if (_x >= 1 << 4)							\
327      _x >>= 4, _c += 4;						\
328    if (_x >= 1 << 2)							\
329      _x >>= 2, _c += 2;						\
330    _c += _x >> 1;							\
331    (count) =  W_TYPE_SIZE - 1 - _c;					\
332  } while (0)
333/* similar to what gcc does for __builtin_ffs, but 0 based rather than 1
334   based, and we don't need a special case for x==0 here */
335#define count_trailing_zeros(count, x)					\
336  do {									\
337    UWtype __ctz_x = (x);						\
338    __asm__ ("popcnt %0 = %1"						\
339	     : "=r" (count)						\
340	     : "r" ((__ctz_x-1) & ~__ctz_x));				\
341  } while (0)
342#endif
343#if defined (__INTEL_COMPILER)
344#include <ia64intrin.h>
345#define umul_ppmm(ph, pl, m0, m1)					\
346  do {									\
347    UWtype _m0 = (m0), _m1 = (m1);					\
348    ph = _m64_xmahu (_m0, _m1, 0);					\
349    pl = _m0 * _m1;							\
350  } while (0)
351#endif
352#ifndef LONGLONG_STANDALONE
353#define udiv_qrnnd(q, r, n1, n0, d) \
354  do { UWtype __di;							\
355    __di = __MPN(invert_limb) (d);					\
356    udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
357  } while (0)
358#define UDIV_PREINV_ALWAYS  1
359#define UDIV_NEEDS_NORMALIZATION 1
360#endif
361#define UDIV_TIME 220
362#endif
363
364
365#if defined (__GNUC__)
366
367/* We sometimes need to clobber "cc" with gcc2, but that would not be
368   understood by gcc1.  Use cpp to avoid major code duplication.  */
369#if __GNUC__ < 2
370#define __CLOBBER_CC
371#define __AND_CLOBBER_CC
372#else /* __GNUC__ >= 2 */
373#define __CLOBBER_CC : "cc"
374#define __AND_CLOBBER_CC , "cc"
375#endif /* __GNUC__ < 2 */
376
377#if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32
378#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
379  __asm__ ("add %1,%4,%5\n\taddc %0,%2,%3"				\
380	   : "=r" (sh), "=&r" (sl)					\
381	   : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl))
382#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
383  __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3"				\
384	   : "=r" (sh), "=&r" (sl)					\
385	   : "r" (ah), "rI" (bh), "r" (al), "rI" (bl))
386#define umul_ppmm(xh, xl, m0, m1) \
387  do {									\
388    USItype __m0 = (m0), __m1 = (m1);					\
389    __asm__ ("multiplu %0,%1,%2"					\
390	     : "=r" (xl)						\
391	     : "r" (__m0), "r" (__m1));					\
392    __asm__ ("multmu %0,%1,%2"						\
393	     : "=r" (xh)						\
394	     : "r" (__m0), "r" (__m1));					\
395  } while (0)
396#define udiv_qrnnd(q, r, n1, n0, d) \
397  __asm__ ("dividu %0,%3,%4"						\
398	   : "=r" (q), "=q" (r)						\
399	   : "1" (n1), "r" (n0), "r" (d))
400#define count_leading_zeros(count, x) \
401    __asm__ ("clz %0,%1"						\
402	     : "=r" (count)						\
403	     : "r" (x))
404#define COUNT_LEADING_ZEROS_0 32
405#endif /* __a29k__ */
406
407#if defined (__arc__)
408#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
409  __asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3"			\
410	   : "=r" (sh),							\
411	     "=&r" (sl)							\
412	   : "r"  ((USItype) (ah)),					\
413	     "rIJ" ((USItype) (bh)),					\
414	     "%r" ((USItype) (al)),					\
415	     "rIJ" ((USItype) (bl)))
416#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
417  __asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3"			\
418	   : "=r" (sh),							\
419	     "=&r" (sl)							\
420	   : "r" ((USItype) (ah)),					\
421	     "rIJ" ((USItype) (bh)),					\
422	     "r" ((USItype) (al)),					\
423	     "rIJ" ((USItype) (bl)))
424#endif
425
426#if defined (__arm__) && W_TYPE_SIZE == 32
427#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
428  __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3"			\
429	   : "=r" (sh), "=&r" (sl)					\
430	   : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
431#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
432  do {									\
433    if (__builtin_constant_p (al))					\
434      {									\
435	if (__builtin_constant_p (ah))					\
436	  __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2"		\
437		   : "=r" (sh), "=&r" (sl)				\
438		   : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
439	else								\
440	  __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3"		\
441		   : "=r" (sh), "=&r" (sl)				\
442		   : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
443      }									\
444    else if (__builtin_constant_p (ah))					\
445      {									\
446	if (__builtin_constant_p (bl))					\
447	  __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2"		\
448		   : "=r" (sh), "=&r" (sl)				\
449		   : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
450	else								\
451	  __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2"		\
452		   : "=r" (sh), "=&r" (sl)				\
453		   : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
454      }									\
455    else if (__builtin_constant_p (bl))					\
456      {									\
457	if (__builtin_constant_p (bh))					\
458	  __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"		\
459		   : "=r" (sh), "=&r" (sl)				\
460		   : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
461	else								\
462	  __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2"		\
463		   : "=r" (sh), "=&r" (sl)				\
464		   : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
465      }									\
466    else /* only bh might be a constant */				\
467      __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"			\
468	       : "=r" (sh), "=&r" (sl)					\
469	       : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC);\
470    } while (0)
471#if 1 || defined (__arm_m__)	/* `M' series has widening multiply support */
472#define umul_ppmm(xh, xl, a, b) \
473  __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
474#define UMUL_TIME 5
475#define smul_ppmm(xh, xl, a, b) \
476  __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
477#ifndef LONGLONG_STANDALONE
478#define udiv_qrnnd(q, r, n1, n0, d) \
479  do { UWtype __di;							\
480    __di = __MPN(invert_limb) (d);					\
481    udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
482  } while (0)
483#define UDIV_PREINV_ALWAYS  1
484#define UDIV_NEEDS_NORMALIZATION 1
485#define UDIV_TIME 70
486#endif /* LONGLONG_STANDALONE */
487#else
488#define umul_ppmm(xh, xl, a, b) \
489  __asm__ ("%@ Inlined umul_ppmm\n"					\
490"	mov	%|r0, %2, lsr #16\n"					\
491"	mov	%|r2, %3, lsr #16\n"					\
492"	bic	%|r1, %2, %|r0, lsl #16\n"				\
493"	bic	%|r2, %3, %|r2, lsl #16\n"				\
494"	mul	%1, %|r1, %|r2\n"					\
495"	mul	%|r2, %|r0, %|r2\n"					\
496"	mul	%|r1, %0, %|r1\n"					\
497"	mul	%0, %|r0, %0\n"						\
498"	adds	%|r1, %|r2, %|r1\n"					\
499"	addcs	%0, %0, #65536\n"					\
500"	adds	%1, %1, %|r1, lsl #16\n"				\
501"	adc	%0, %0, %|r1, lsr #16"					\
502	   : "=&r" (xh), "=r" (xl)					\
503	   : "r" (a), "r" (b)						\
504	   : "r0", "r1", "r2")
505#define UMUL_TIME 20
506#ifndef LONGLONG_STANDALONE
507#define udiv_qrnnd(q, r, n1, n0, d) \
508  do { UWtype __r;							\
509    (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d));			\
510    (r) = __r;								\
511  } while (0)
512extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype *, UWtype, UWtype, UWtype));
513#define UDIV_TIME 200
514#endif /* LONGLONG_STANDALONE */
515#endif
516#if defined (__ARM_ARCH_5__)
517/* This actually requires arm 5 */
518#define count_leading_zeros(count, x) \
519  __asm__ ("clz\t%0, %1" : "=r" (count) : "r" (x))
520#define COUNT_LEADING_ZEROS_0 32
521#endif
522#endif /* __arm__ */
523
524#if defined (__clipper__) && W_TYPE_SIZE == 32
525#define umul_ppmm(w1, w0, u, v) \
526  ({union {UDItype __ll;						\
527	   struct {USItype __l, __h;} __i;				\
528	  } __x;							\
529  __asm__ ("mulwux %2,%0"						\
530	   : "=r" (__x.__ll)						\
531	   : "%0" ((USItype)(u)), "r" ((USItype)(v)));			\
532  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
533#define smul_ppmm(w1, w0, u, v) \
534  ({union {DItype __ll;							\
535	   struct {SItype __l, __h;} __i;				\
536	  } __x;							\
537  __asm__ ("mulwx %2,%0"						\
538	   : "=r" (__x.__ll)						\
539	   : "%0" ((SItype)(u)), "r" ((SItype)(v)));			\
540  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
541#define __umulsidi3(u, v) \
542  ({UDItype __w;							\
543    __asm__ ("mulwux %2,%0"						\
544	     : "=r" (__w) : "%0" ((USItype)(u)), "r" ((USItype)(v)));	\
545    __w; })
546#endif /* __clipper__ */
547
548/* Fujitsu vector computers.  */
549#if defined (__uxp__) && W_TYPE_SIZE == 32
550#define umul_ppmm(ph, pl, u, v) \
551  do {									\
552    union {UDItype __ll;						\
553	   struct {USItype __h, __l;} __i;				\
554	  } __x;							\
555    __asm__ ("mult.lu %1,%2,%0"	: "=r" (__x.__ll) : "%r" (u), "rK" (v));\
556    (ph) = __x.__i.__h;							\
557    (pl) = __x.__i.__l;							\
558  } while (0)
559#define smul_ppmm(ph, pl, u, v) \
560  do {									\
561    union {UDItype __ll;						\
562	   struct {USItype __h, __l;} __i;				\
563	  } __x;							\
564    __asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v));	\
565    (ph) = __x.__i.__h;							\
566    (pl) = __x.__i.__l;							\
567  } while (0)
568#endif
569
570#if defined (__gmicro__) && W_TYPE_SIZE == 32
571#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
572  __asm__ ("add.w %5,%1\n\taddx %3,%0"					\
573	   : "=g" (sh), "=&g" (sl)					\
574	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
575	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
576#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
577  __asm__ ("sub.w %5,%1\n\tsubx %3,%0"					\
578	   : "=g" (sh), "=&g" (sl)					\
579	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
580	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
581#define umul_ppmm(ph, pl, m0, m1) \
582  __asm__ ("mulx %3,%0,%1"						\
583	   : "=g" (ph), "=r" (pl)					\
584	   : "%0" ((USItype)(m0)), "g" ((USItype)(m1)))
585#define udiv_qrnnd(q, r, nh, nl, d) \
586  __asm__ ("divx %4,%0,%1"						\
587	   : "=g" (q), "=r" (r)						\
588	   : "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d)))
589#define count_leading_zeros(count, x) \
590  __asm__ ("bsch/1 %1,%0"						\
591	   : "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0))
592#endif
593
594#if defined (__hppa) && W_TYPE_SIZE == 32
595#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
596  __asm__ ("add%I5 %5,%r4,%1\n\taddc %r2,%r3,%0"			\
597	   : "=r" (sh), "=&r" (sl)					\
598	   : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
599#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
600  __asm__ ("sub%I4 %4,%r5,%1\n\tsubb %r2,%r3,%0"			\
601	   : "=r" (sh), "=&r" (sl)					\
602	   : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
603#if defined (_PA_RISC1_1)
604#define umul_ppmm(wh, wl, u, v) \
605  do {									\
606    union {UDItype __ll;						\
607	   struct {USItype __h, __l;} __i;				\
608	  } __x;							\
609    __asm__ ("xmpyu %1,%2,%0" : "=*f" (__x.__ll) : "*f" (u), "*f" (v));	\
610    (wh) = __x.__i.__h;							\
611    (wl) = __x.__i.__l;							\
612  } while (0)
613#define UMUL_TIME 8
614#define UDIV_TIME 60
615#else
616#define UMUL_TIME 40
617#define UDIV_TIME 80
618#endif
619#define count_leading_zeros(count, x) \
620  do {									\
621    USItype __tmp;							\
622    __asm__ (								\
623       "ldi		1,%0\n"						\
624"	extru,=		%1,15,16,%%r0	; Bits 31..16 zero?\n"		\
625"	extru,tr	%1,15,16,%1	; No.  Shift down, skip add.\n"	\
626"	ldo		16(%0),%0	; Yes.  Perform add.\n"		\
627"	extru,=		%1,23,8,%%r0	; Bits 15..8 zero?\n"		\
628"	extru,tr	%1,23,8,%1	; No.  Shift down, skip add.\n"	\
629"	ldo		8(%0),%0	; Yes.  Perform add.\n"		\
630"	extru,=		%1,27,4,%%r0	; Bits 7..4 zero?\n"		\
631"	extru,tr	%1,27,4,%1	; No.  Shift down, skip add.\n"	\
632"	ldo		4(%0),%0	; Yes.  Perform add.\n"		\
633"	extru,=		%1,29,2,%%r0	; Bits 3..2 zero?\n"		\
634"	extru,tr	%1,29,2,%1	; No.  Shift down, skip add.\n"	\
635"	ldo		2(%0),%0	; Yes.  Perform add.\n"		\
636"	extru		%1,30,1,%1	; Extract bit 1.\n"		\
637"	sub		%0,%1,%0	; Subtract it.\n"		\
638	: "=r" (count), "=r" (__tmp) : "1" (x));			\
639  } while (0)
640#endif /* hppa */
641
642/* These macros are for ABI=2.0w.  In ABI=2.0n they can't be used, since GCC
643   (3.2) puts longlong into two adjacent 32-bit registers.  Presumably this
644   is just a case of no direct support for 2.0n but treating it like 1.0. */
645#if defined (__hppa) && W_TYPE_SIZE == 64 && ! defined (_LONG_LONG_LIMB)
646#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
647  __asm__ ("add%I5 %5,%r4,%1\n\tadd,dc %r2,%r3,%0"			\
648	   : "=r" (sh), "=&r" (sl)					\
649	   : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
650#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
651  __asm__ ("sub%I4 %4,%r5,%1\n\tsub,db %r2,%r3,%0"			\
652	   : "=r" (sh), "=&r" (sl)					\
653	   : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
654#endif /* hppa */
655
656#if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32
657#if defined (__zarch__) || defined (HAVE_HOST_CPU_s390_zarch)
658#define add_ssaaaa(sh, sl, ah, al, bh, bl)				\
659  do {									\
660/*  if (__builtin_constant_p (bl))					\
661      __asm__ ("alfi\t%1,%o5\n\talcr\t%0,%3"				\
662	       : "=r" (sh), "=&r" (sl)					\
663	       : "0"  (ah), "r" (bh), "%1" (al), "n" (bl) __CLOBBER_CC);\
664    else								\
665*/    __asm__ ("alr\t%1,%5\n\talcr\t%0,%3"				\
666	       : "=r" (sh), "=&r" (sl)					\
667	       : "0"  (ah), "r" (bh), "%1" (al), "r" (bl)__CLOBBER_CC);	\
668  } while (0)
669#define sub_ddmmss(sh, sl, ah, al, bh, bl)				\
670  do {									\
671/*  if (__builtin_constant_p (bl))					\
672      __asm__ ("slfi\t%1,%o5\n\tslbr\t%0,%3"				\
673	       : "=r" (sh), "=&r" (sl)					\
674	       : "0" (ah), "r" (bh), "1" (al), "n" (bl) __CLOBBER_CC);	\
675    else								\
676*/    __asm__ ("slr\t%1,%5\n\tslbr\t%0,%3"				\
677	       : "=r" (sh), "=&r" (sl)					\
678	       : "0" (ah), "r" (bh), "1" (al), "r" (bl) __CLOBBER_CC);	\
679  } while (0)
680#if __GMP_GNUC_PREREQ (4,5)
681#define umul_ppmm(xh, xl, m0, m1)					\
682  do {									\
683    union {UDItype __ll;						\
684	   struct {USItype __h, __l;} __i;				\
685	  } __x;							\
686    __x.__ll = (UDItype) (m0) * (UDItype) (m1);				\
687    (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
688  } while (0)
689#else
690#if 0
691/* FIXME: this fails if gcc knows about the 64-bit registers.  Use only
692   with a new enough processor pretending we have 32-bit registers.  */
693#define umul_ppmm(xh, xl, m0, m1)					\
694  do {									\
695    union {UDItype __ll;						\
696	   struct {USItype __h, __l;} __i;				\
697	  } __x;							\
698    __asm__ ("mlr\t%0,%2"						\
699	     : "=r" (__x.__ll)						\
700	     : "%0" (m0), "r" (m1));					\
701    (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
702  } while (0)
703#else
704#define umul_ppmm(xh, xl, m0, m1)					\
705  do {									\
706  /* When we have 64-bit regs and gcc is aware of that, we cannot simply use
707     DImode for the product, since that would be allocated to a single 64-bit
708     register, whereas mlr uses the low 32-bits of an even-odd register pair.
709  */									\
710    register USItype __r0 __asm__ ("0");				\
711    register USItype __r1 __asm__ ("1") = (m0);				\
712    __asm__ ("mlr\t%0,%3"						\
713	     : "=r" (__r0), "=r" (__r1)					\
714	     : "r" (__r1), "r" (m1));					\
715    (xh) = __r0; (xl) = __r1;						\
716  } while (0)
717#endif /* if 0 */
718#endif
719#if 0
720/* FIXME: this fails if gcc knows about the 64-bit registers.  Use only
721   with a new enough processor pretending we have 32-bit registers.  */
722#define udiv_qrnnd(q, r, n1, n0, d)					\
723  do {									\
724    union {UDItype __ll;						\
725	   struct {USItype __h, __l;} __i;				\
726	  } __x;							\
727    __x.__i.__h = n1; __x.__i.__l = n0;					\
728    __asm__ ("dlr\t%0,%2"						\
729	     : "=r" (__x.__ll)						\
730	     : "0" (__x.__ll), "r" (d));				\
731    (q) = __x.__i.__l; (r) = __x.__i.__h;				\
732  } while (0)
733#else
734#define udiv_qrnnd(q, r, n1, n0, d)					\
735  do {									\
736    register USItype __r0 __asm__ ("0") = (n1);				\
737    register USItype __r1 __asm__ ("1") = (n0);				\
738    __asm__ ("dlr\t%0,%4"						\
739	     : "=r" (__r0), "=r" (__r1)					\
740	     : "r" (__r0), "r" (__r1), "r" (d));			\
741    (q) = __r1; (r) = __r0;						\
742  } while (0)
743#endif /* if 0 */
744#else /* if __zarch__ */
745/* FIXME: this fails if gcc knows about the 64-bit registers.  */
746#define smul_ppmm(xh, xl, m0, m1)					\
747  do {									\
748    union {DItype __ll;							\
749	   struct {USItype __h, __l;} __i;				\
750	  } __x;							\
751    __asm__ ("mr\t%0,%2"						\
752	     : "=r" (__x.__ll)						\
753	     : "%0" (m0), "r" (m1));					\
754    (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
755  } while (0)
756/* FIXME: this fails if gcc knows about the 64-bit registers.  */
757#define sdiv_qrnnd(q, r, n1, n0, d)					\
758  do {									\
759    union {DItype __ll;							\
760	   struct {USItype __h, __l;} __i;				\
761	  } __x;							\
762    __x.__i.__h = n1; __x.__i.__l = n0;					\
763    __asm__ ("dr\t%0,%2"						\
764	     : "=r" (__x.__ll)						\
765	     : "0" (__x.__ll), "r" (d));				\
766    (q) = __x.__i.__l; (r) = __x.__i.__h;				\
767  } while (0)
768#endif /* if __zarch__ */
769#endif
770
771#if defined (__s390x__) && W_TYPE_SIZE == 64
772/* We need to cast operands with register constraints, otherwise their types
773   will be assumed to be SImode by gcc.  For these machines, such operations
774   will insert a value into the low 32 bits, and leave the high 32 bits with
775   garbage.  */
776#define add_ssaaaa(sh, sl, ah, al, bh, bl)				\
777  do {									\
778    __asm__ ("algr\t%1,%5\n\talcgr\t%0,%3"				\
779	       : "=r" (sh), "=&r" (sl)					\
780	       : "0"  ((UDItype)(ah)), "r" ((UDItype)(bh)),		\
781		 "%1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \
782  } while (0)
783#define sub_ddmmss(sh, sl, ah, al, bh, bl)				\
784  do {									\
785    __asm__ ("slgr\t%1,%5\n\tslbgr\t%0,%3"				\
786	     : "=r" (sh), "=&r" (sl)					\
787	     : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)),		\
788	       "1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC);	\
789  } while (0)
790#define umul_ppmm(xh, xl, m0, m1)					\
791  do {									\
792    union {unsigned int __attribute__ ((mode(TI))) __ll;		\
793	   struct {UDItype __h, __l;} __i;				\
794	  } __x;							\
795    __asm__ ("mlgr\t%0,%2"						\
796	     : "=r" (__x.__ll)						\
797	     : "%0" ((UDItype)(m0)), "r" ((UDItype)(m1)));		\
798    (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
799  } while (0)
800#define udiv_qrnnd(q, r, n1, n0, d)					\
801  do {									\
802    union {unsigned int __attribute__ ((mode(TI))) __ll;		\
803	   struct {UDItype __h, __l;} __i;				\
804	  } __x;							\
805    __x.__i.__h = n1; __x.__i.__l = n0;					\
806    __asm__ ("dlgr\t%0,%2"						\
807	     : "=r" (__x.__ll)						\
808	     : "0" (__x.__ll), "r" ((UDItype)(d)));			\
809    (q) = __x.__i.__l; (r) = __x.__i.__h;				\
810  } while (0)
811#if 0 /* FIXME: Enable for z10 (?) */
812#define count_leading_zeros(cnt, x)					\
813  do {									\
814    union {unsigned int __attribute__ ((mode(TI))) __ll;		\
815	   struct {UDItype __h, __l;} __i;				\
816	  } __clr_cnt;							\
817    __asm__ ("flogr\t%0,%1"						\
818	     : "=r" (__clr_cnt.__ll)					\
819	     : "r" (x) __CLOBBER_CC);					\
820    (cnt) = __clr_cnt.__i.__h;						\
821  } while (0)
822#endif
823#endif
824
825#if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
826#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
827  __asm__ ("addl %5,%k1\n\tadcl %3,%k0"					\
828	   : "=r" (sh), "=&r" (sl)					\
829	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
830	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
831#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
832  __asm__ ("subl %5,%k1\n\tsbbl %3,%k0"					\
833	   : "=r" (sh), "=&r" (sl)					\
834	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
835	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
836#define umul_ppmm(w1, w0, u, v) \
837  __asm__ ("mull %3"							\
838	   : "=a" (w0), "=d" (w1)					\
839	   : "%0" ((USItype)(u)), "rm" ((USItype)(v)))
840#define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
841  __asm__ ("divl %4"		     /* stringification in K&R C */	\
842	   : "=a" (q), "=d" (r)						\
843	   : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx)))
844
845#if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium || HAVE_HOST_CPU_pentiummmx
846/* Pentium bsrl takes between 10 and 72 cycles depending where the most
847   significant 1 bit is, hence the use of the following alternatives.  bsfl
848   is slow too, between 18 and 42 depending where the least significant 1
849   bit is, so let the generic count_trailing_zeros below make use of the
850   count_leading_zeros here too.  */
851
852#if HAVE_HOST_CPU_pentiummmx && ! defined (LONGLONG_STANDALONE)
853/* The following should be a fixed 14 or 15 cycles, but possibly plus an L1
854   cache miss reading from __clz_tab.  For P55 it's favoured over the float
855   below so as to avoid mixing MMX and x87, since the penalty for switching
856   between the two is about 100 cycles.
857
858   The asm block sets __shift to -3 if the high 24 bits are clear, -2 for
859   16, -1 for 8, or 0 otherwise.  This could be written equivalently as
860   follows, but as of gcc 2.95.2 it results in conditional jumps.
861
862       __shift = -(__n < 0x1000000);
863       __shift -= (__n < 0x10000);
864       __shift -= (__n < 0x100);
865
866   The middle two sbbl and cmpl's pair, and with luck something gcc
867   generates might pair with the first cmpl and the last sbbl.  The "32+1"
868   constant could be folded into __clz_tab[], but it doesn't seem worth
869   making a different table just for that.  */
870
871#define count_leading_zeros(c,n)					\
872  do {									\
873    USItype  __n = (n);							\
874    USItype  __shift;							\
875    __asm__ ("cmpl  $0x1000000, %1\n"					\
876	     "sbbl  %0, %0\n"						\
877	     "cmpl  $0x10000, %1\n"					\
878	     "sbbl  $0, %0\n"						\
879	     "cmpl  $0x100, %1\n"					\
880	     "sbbl  $0, %0\n"						\
881	     : "=&r" (__shift) : "r"  (__n));				\
882    __shift = __shift*8 + 24 + 1;					\
883    (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift];			\
884  } while (0)
885#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
886#define COUNT_LEADING_ZEROS_0   31   /* n==0 indistinguishable from n==1 */
887
888#else /* ! pentiummmx || LONGLONG_STANDALONE */
889/* The following should be a fixed 14 cycles or so.  Some scheduling
890   opportunities should be available between the float load/store too.  This
891   sort of code is used in gcc 3 for __builtin_ffs (with "n&-n") and is
892   apparently suggested by the Intel optimizing manual (don't know exactly
893   where).  gcc 2.95 or up will be best for this, so the "double" is
894   correctly aligned on the stack.  */
895#define count_leading_zeros(c,n)					\
896  do {									\
897    union {								\
898      double    d;							\
899      unsigned  a[2];							\
900    } __u;								\
901    ASSERT ((n) != 0);							\
902    __u.d = (UWtype) (n);						\
903    (c) = 0x3FF + 31 - (__u.a[1] >> 20);				\
904  } while (0)
905#define COUNT_LEADING_ZEROS_0   (0x3FF + 31)
906#endif /* pentiummx */
907
908#else /* ! pentium */
909
910#if __GMP_GNUC_PREREQ (3,4)  /* using bsrl */
911#define count_leading_zeros(count,x)  count_leading_zeros_gcc_clz(count,x)
912#endif /* gcc clz */
913
914/* On P6, gcc prior to 3.0 generates a partial register stall for
915   __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former
916   being 1 code byte smaller.  "31-__cbtmp" is a workaround, probably at the
917   cost of one extra instruction.  Do this for "i386" too, since that means
918   generic x86.  */
919#if ! defined (count_leading_zeros) && __GNUC__ < 3                     \
920  && (HAVE_HOST_CPU_i386						\
921      || HAVE_HOST_CPU_i686						\
922      || HAVE_HOST_CPU_pentiumpro					\
923      || HAVE_HOST_CPU_pentium2						\
924      || HAVE_HOST_CPU_pentium3)
925#define count_leading_zeros(count, x)					\
926  do {									\
927    USItype __cbtmp;							\
928    ASSERT ((x) != 0);							\
929    __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x)));	\
930    (count) = 31 - __cbtmp;						\
931  } while (0)
932#endif /* gcc<3 asm bsrl */
933
934#ifndef count_leading_zeros
935#define count_leading_zeros(count, x)					\
936  do {									\
937    USItype __cbtmp;							\
938    ASSERT ((x) != 0);							\
939    __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x)));	\
940    (count) = __cbtmp ^ 31;						\
941  } while (0)
942#endif /* asm bsrl */
943
944#if __GMP_GNUC_PREREQ (3,4)  /* using bsfl */
945#define count_trailing_zeros(count,x)  count_trailing_zeros_gcc_ctz(count,x)
946#endif /* gcc ctz */
947
948#ifndef count_trailing_zeros
949#define count_trailing_zeros(count, x)					\
950  do {									\
951    ASSERT ((x) != 0);							\
952    __asm__ ("bsfl %1,%k0" : "=r" (count) : "rm" ((USItype)(x)));	\
953  } while (0)
954#endif /* asm bsfl */
955
956#endif /* ! pentium */
957
958#ifndef UMUL_TIME
959#define UMUL_TIME 10
960#endif
961#ifndef UDIV_TIME
962#define UDIV_TIME 40
963#endif
964#endif /* 80x86 */
965
966#if defined (__amd64__) && W_TYPE_SIZE == 64
967#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
968  __asm__ ("addq %5,%q1\n\tadcq %3,%q0"					\
969	   : "=r" (sh), "=&r" (sl)					\
970	   : "0"  ((UDItype)(ah)), "rme" ((UDItype)(bh)),		\
971	     "%1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
972#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
973  __asm__ ("subq %5,%q1\n\tsbbq %3,%q0"					\
974	   : "=r" (sh), "=&r" (sl)					\
975	   : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)),		\
976	     "1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
977#define umul_ppmm(w1, w0, u, v) \
978  __asm__ ("mulq %3"							\
979	   : "=a" (w0), "=d" (w1)					\
980	   : "%0" ((UDItype)(u)), "rm" ((UDItype)(v)))
981#define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
982  __asm__ ("divq %4"		     /* stringification in K&R C */	\
983	   : "=a" (q), "=d" (r)						\
984	   : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx)))
985/* bsrq destination must be a 64-bit register, hence UDItype for __cbtmp. */
986#define count_leading_zeros(count, x)					\
987  do {									\
988    UDItype __cbtmp;							\
989    ASSERT ((x) != 0);							\
990    __asm__ ("bsrq %1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x)));	\
991    (count) = __cbtmp ^ 63;						\
992  } while (0)
993/* bsfq destination must be a 64-bit register, "%q0" forces this in case
994   count is only an int. */
995#define count_trailing_zeros(count, x)					\
996  do {									\
997    ASSERT ((x) != 0);							\
998    __asm__ ("bsfq %1,%q0" : "=r" (count) : "rm" ((UDItype)(x)));	\
999  } while (0)
1000#endif /* x86_64 */
1001
1002#if defined (__i860__) && W_TYPE_SIZE == 32
1003#define rshift_rhlc(r,h,l,c) \
1004  __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0"				\
1005	   "=r" (r) : "r" (h), "r" (l), "rn" (c))
1006#endif /* i860 */
1007
1008#if defined (__i960__) && W_TYPE_SIZE == 32
1009#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1010  __asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0"			\
1011	   : "=r" (sh), "=&r" (sl)					\
1012	   : "dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl))
1013#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1014  __asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0"			\
1015	   : "=r" (sh), "=&r" (sl)					\
1016	   : "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl))
1017#define umul_ppmm(w1, w0, u, v) \
1018  ({union {UDItype __ll;						\
1019	   struct {USItype __l, __h;} __i;				\
1020	  } __x;							\
1021  __asm__ ("emul %2,%1,%0"						\
1022	   : "=d" (__x.__ll) : "%dI" (u), "dI" (v));			\
1023  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1024#define __umulsidi3(u, v) \
1025  ({UDItype __w;							\
1026    __asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v));	\
1027    __w; })
1028#define udiv_qrnnd(q, r, nh, nl, d) \
1029  do {									\
1030    union {UDItype __ll;						\
1031	   struct {USItype __l, __h;} __i;				\
1032	  } __nn;							\
1033    __nn.__i.__h = (nh); __nn.__i.__l = (nl);				\
1034    __asm__ ("ediv %d,%n,%0"						\
1035	   : "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d));		\
1036    (r) = __rq.__i.__l; (q) = __rq.__i.__h;				\
1037  } while (0)
1038#define count_leading_zeros(count, x) \
1039  do {									\
1040    USItype __cbtmp;							\
1041    __asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x));		\
1042    (count) = __cbtmp ^ 31;						\
1043  } while (0)
1044#define COUNT_LEADING_ZEROS_0 (-32) /* sic */
1045#if defined (__i960mx)		/* what is the proper symbol to test??? */
1046#define rshift_rhlc(r,h,l,c) \
1047  do {									\
1048    union {UDItype __ll;						\
1049	   struct {USItype __l, __h;} __i;				\
1050	  } __nn;							\
1051    __nn.__i.__h = (h); __nn.__i.__l = (l);				\
1052    __asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c));	\
1053  }
1054#endif /* i960mx */
1055#endif /* i960 */
1056
1057#if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \
1058     || defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \
1059     || defined (__mc5307__)) && W_TYPE_SIZE == 32
1060#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1061  __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0"				\
1062	   : "=d" (sh), "=&d" (sl)					\
1063	   : "0"  ((USItype)(ah)), "d" ((USItype)(bh)),			\
1064	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1065#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1066  __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0"				\
1067	   : "=d" (sh), "=&d" (sl)					\
1068	   : "0" ((USItype)(ah)), "d" ((USItype)(bh)),			\
1069	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
1070/* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r.  */
1071#if defined (__mc68020__) || defined(mc68020) \
1072     || defined (__mc68030__) || defined (mc68030) \
1073     || defined (__mc68040__) || defined (mc68040) \
1074     || defined (__mcpu32__) || defined (mcpu32) \
1075     || defined (__NeXT__)
1076#define umul_ppmm(w1, w0, u, v) \
1077  __asm__ ("mulu%.l %3,%1:%0"						\
1078	   : "=d" (w0), "=d" (w1)					\
1079	   : "%0" ((USItype)(u)), "dmi" ((USItype)(v)))
1080#define UMUL_TIME 45
1081#define udiv_qrnnd(q, r, n1, n0, d) \
1082  __asm__ ("divu%.l %4,%1:%0"						\
1083	   : "=d" (q), "=d" (r)						\
1084	   : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
1085#define UDIV_TIME 90
1086#define sdiv_qrnnd(q, r, n1, n0, d) \
1087  __asm__ ("divs%.l %4,%1:%0"						\
1088	   : "=d" (q), "=d" (r)						\
1089	   : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
1090#else /* for other 68k family members use 16x16->32 multiplication */
1091#define umul_ppmm(xh, xl, a, b) \
1092  do { USItype __umul_tmp1, __umul_tmp2;				\
1093	__asm__ ("| Inlined umul_ppmm\n"				\
1094"	move%.l	%5,%3\n"						\
1095"	move%.l	%2,%0\n"						\
1096"	move%.w	%3,%1\n"						\
1097"	swap	%3\n"							\
1098"	swap	%0\n"							\
1099"	mulu%.w	%2,%1\n"						\
1100"	mulu%.w	%3,%0\n"						\
1101"	mulu%.w	%2,%3\n"						\
1102"	swap	%2\n"							\
1103"	mulu%.w	%5,%2\n"						\
1104"	add%.l	%3,%2\n"						\
1105"	jcc	1f\n"							\
1106"	add%.l	%#0x10000,%0\n"						\
1107"1:	move%.l	%2,%3\n"						\
1108"	clr%.w	%2\n"							\
1109"	swap	%2\n"							\
1110"	swap	%3\n"							\
1111"	clr%.w	%3\n"							\
1112"	add%.l	%3,%1\n"						\
1113"	addx%.l	%2,%0\n"						\
1114"	| End inlined umul_ppmm"					\
1115	      : "=&d" (xh), "=&d" (xl),					\
1116		"=d" (__umul_tmp1), "=&d" (__umul_tmp2)			\
1117	      : "%2" ((USItype)(a)), "d" ((USItype)(b)));		\
1118  } while (0)
1119#define UMUL_TIME 100
1120#define UDIV_TIME 400
1121#endif /* not mc68020 */
1122/* The '020, '030, '040 and '060 have bitfield insns.
1123   GCC 3.4 defines __mc68020__ when in CPU32 mode, check for __mcpu32__ to
1124   exclude bfffo on that chip (bitfield insns not available).  */
1125#if (defined (__mc68020__) || defined (mc68020)    \
1126     || defined (__mc68030__) || defined (mc68030) \
1127     || defined (__mc68040__) || defined (mc68040) \
1128     || defined (__mc68060__) || defined (mc68060) \
1129     || defined (__NeXT__))                        \
1130  && ! defined (__mcpu32__)
1131#define count_leading_zeros(count, x) \
1132  __asm__ ("bfffo %1{%b2:%b2},%0"					\
1133	   : "=d" (count)						\
1134	   : "od" ((USItype) (x)), "n" (0))
1135#define COUNT_LEADING_ZEROS_0 32
1136#endif
1137#endif /* mc68000 */
1138
1139#if defined (__m88000__) && W_TYPE_SIZE == 32
1140#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1141  __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3"			\
1142	   : "=r" (sh), "=&r" (sl)					\
1143	   : "rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl))
1144#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1145  __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3"			\
1146	   : "=r" (sh), "=&r" (sl)					\
1147	   : "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl))
1148#define count_leading_zeros(count, x) \
1149  do {									\
1150    USItype __cbtmp;							\
1151    __asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x));			\
1152    (count) = __cbtmp ^ 31;						\
1153  } while (0)
1154#define COUNT_LEADING_ZEROS_0 63 /* sic */
1155#if defined (__m88110__)
1156#define umul_ppmm(wh, wl, u, v) \
1157  do {									\
1158    union {UDItype __ll;						\
1159	   struct {USItype __h, __l;} __i;				\
1160	  } __x;							\
1161    __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v));	\
1162    (wh) = __x.__i.__h;							\
1163    (wl) = __x.__i.__l;							\
1164  } while (0)
1165#define udiv_qrnnd(q, r, n1, n0, d) \
1166  ({union {UDItype __ll;						\
1167	   struct {USItype __h, __l;} __i;				\
1168	  } __x, __q;							\
1169  __x.__i.__h = (n1); __x.__i.__l = (n0);				\
1170  __asm__ ("divu.d %0,%1,%2"						\
1171	   : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d));		\
1172  (r) = (n0) - __q.__l * (d); (q) = __q.__l; })
1173#define UMUL_TIME 5
1174#define UDIV_TIME 25
1175#else
1176#define UMUL_TIME 17
1177#define UDIV_TIME 150
1178#endif /* __m88110__ */
1179#endif /* __m88000__ */
1180
1181#if defined (__mips) && W_TYPE_SIZE == 32
1182#if __GMP_GNUC_PREREQ (4,4)
1183#define umul_ppmm(w1, w0, u, v) \
1184  do {									\
1185    UDItype __ll = (UDItype)(u) * (v);					\
1186    w1 = __ll >> 32;							\
1187    w0 = __ll;								\
1188  } while (0)
1189#endif
1190#if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7)
1191#define umul_ppmm(w1, w0, u, v) \
1192  __asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
1193#endif
1194#if !defined (umul_ppmm)
1195#define umul_ppmm(w1, w0, u, v) \
1196  __asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1"				\
1197	   : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
1198#endif
1199#define UMUL_TIME 10
1200#define UDIV_TIME 100
1201#endif /* __mips */
1202
1203#if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64
1204#if __GMP_GNUC_PREREQ (4,4)
1205#define umul_ppmm(w1, w0, u, v) \
1206  do {									\
1207    typedef unsigned int __ll_UTItype __attribute__((mode(TI)));	\
1208    __ll_UTItype __ll = (__ll_UTItype)(u) * (v);			\
1209    w1 = __ll >> 64;							\
1210    w0 = __ll;								\
1211  } while (0)
1212#endif
1213#if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7)
1214#define umul_ppmm(w1, w0, u, v) \
1215  __asm__ ("dmultu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
1216#endif
1217#if !defined (umul_ppmm)
1218#define umul_ppmm(w1, w0, u, v) \
1219  __asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1"				\
1220	   : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
1221#endif
1222#define UMUL_TIME 20
1223#define UDIV_TIME 140
1224#endif /* __mips */
1225
1226#if defined (__mmix__) && W_TYPE_SIZE == 64
1227#define umul_ppmm(w1, w0, u, v) \
1228  __asm__ ("MULU %0,%2,%3" : "=r" (w0), "=z" (w1) : "r" (u), "r" (v))
1229#endif
1230
1231#if defined (__ns32000__) && W_TYPE_SIZE == 32
1232#define umul_ppmm(w1, w0, u, v) \
1233  ({union {UDItype __ll;						\
1234	   struct {USItype __l, __h;} __i;				\
1235	  } __x;							\
1236  __asm__ ("meid %2,%0"							\
1237	   : "=g" (__x.__ll)						\
1238	   : "%0" ((USItype)(u)), "g" ((USItype)(v)));			\
1239  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1240#define __umulsidi3(u, v) \
1241  ({UDItype __w;							\
1242    __asm__ ("meid %2,%0"						\
1243	     : "=g" (__w)						\
1244	     : "%0" ((USItype)(u)), "g" ((USItype)(v)));		\
1245    __w; })
1246#define udiv_qrnnd(q, r, n1, n0, d) \
1247  ({union {UDItype __ll;						\
1248	   struct {USItype __l, __h;} __i;				\
1249	  } __x;							\
1250  __x.__i.__h = (n1); __x.__i.__l = (n0);				\
1251  __asm__ ("deid %2,%0"							\
1252	   : "=g" (__x.__ll)						\
1253	   : "0" (__x.__ll), "g" ((USItype)(d)));			\
1254  (r) = __x.__i.__l; (q) = __x.__i.__h; })
1255#define count_trailing_zeros(count,x) \
1256  do {									\
1257    __asm__ ("ffsd	%2,%0"						\
1258	     : "=r" (count)						\
1259	     : "0" ((USItype) 0), "r" ((USItype) (x)));			\
1260  } while (0)
1261#endif /* __ns32000__ */
1262
1263/* In the past we had a block of various #defines tested
1264       _ARCH_PPC    - AIX
1265       _ARCH_PWR    - AIX
1266       __powerpc__  - gcc
1267       __POWERPC__  - BEOS
1268       __ppc__      - Darwin
1269       PPC          - old gcc, GNU/Linux, SysV
1270   The plain PPC test was not good for vxWorks, since PPC is defined on all
1271   CPUs there (eg. m68k too), as a constant one is expected to compare
1272   CPU_FAMILY against.
1273
1274   At any rate, this was pretty unattractive and a bit fragile.  The use of
1275   HAVE_HOST_CPU_FAMILY is designed to cut through it all and be sure of
1276   getting the desired effect.
1277
1278   ENHANCE-ME: We should test _IBMR2 here when we add assembly support for
1279   the system vendor compilers.  (Is that vendor compilers with inline asm,
1280   or what?)  */
1281
1282#if (HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc)        \
1283  && W_TYPE_SIZE == 32
1284#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1285  do {									\
1286    if (__builtin_constant_p (bh) && (bh) == 0)				\
1287      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2"		\
1288	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
1289    else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)		\
1290      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2"		\
1291	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
1292    else								\
1293      __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3"		\
1294	     : "=r" (sh), "=&r" (sl)					\
1295	     : "r" (ah), "r" (bh), "%r" (al), "rI" (bl));		\
1296  } while (0)
1297#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1298  do {									\
1299    if (__builtin_constant_p (ah) && (ah) == 0)				\
1300      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2"	\
1301	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
1302    else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0)		\
1303      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2"	\
1304	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
1305    else if (__builtin_constant_p (bh) && (bh) == 0)			\
1306      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2"		\
1307	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
1308    else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)		\
1309      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2"		\
1310	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
1311    else								\
1312      __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2"	\
1313	       : "=r" (sh), "=&r" (sl)					\
1314	       : "r" (ah), "r" (bh), "rI" (al), "r" (bl));		\
1315  } while (0)
1316#define count_leading_zeros(count, x) \
1317  __asm__ ("{cntlz|cntlzw} %0,%1" : "=r" (count) : "r" (x))
1318#define COUNT_LEADING_ZEROS_0 32
1319#if HAVE_HOST_CPU_FAMILY_powerpc
1320#if __GMP_GNUC_PREREQ (4,4)
1321#define umul_ppmm(w1, w0, u, v) \
1322  do {									\
1323    UDItype __ll = (UDItype)(u) * (v);					\
1324    w1 = __ll >> 32;							\
1325    w0 = __ll;								\
1326  } while (0)
1327#endif
1328#if !defined (umul_ppmm)
1329#define umul_ppmm(ph, pl, m0, m1) \
1330  do {									\
1331    USItype __m0 = (m0), __m1 = (m1);					\
1332    __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1));	\
1333    (pl) = __m0 * __m1;							\
1334  } while (0)
1335#endif
1336#define UMUL_TIME 15
1337#define smul_ppmm(ph, pl, m0, m1) \
1338  do {									\
1339    SItype __m0 = (m0), __m1 = (m1);					\
1340    __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1));	\
1341    (pl) = __m0 * __m1;							\
1342  } while (0)
1343#define SMUL_TIME 14
1344#define UDIV_TIME 120
1345#else
1346#define UMUL_TIME 8
1347#define smul_ppmm(xh, xl, m0, m1) \
1348  __asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1))
1349#define SMUL_TIME 4
1350#define sdiv_qrnnd(q, r, nh, nl, d) \
1351  __asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d))
1352#define UDIV_TIME 100
1353#endif
1354#endif /* 32-bit POWER architecture variants.  */
1355
1356/* We should test _IBMR2 here when we add assembly support for the system
1357   vendor compilers.  */
1358#if HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64
1359#if !defined (_LONG_LONG_LIMB)
1360/* _LONG_LONG_LIMB is ABI=mode32 where adde operates on 32-bit values.  So
1361   use adde etc only when not _LONG_LONG_LIMB.  */
1362#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1363  do {									\
1364    if (__builtin_constant_p (bh) && (bh) == 0)				\
1365      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2"		\
1366	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
1367    else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)		\
1368      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2"		\
1369	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
1370    else								\
1371      __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3"		\
1372	     : "=r" (sh), "=&r" (sl)					\
1373	     : "r" (ah), "r" (bh), "%r" (al), "rI" (bl));		\
1374  } while (0)
1375/* We use "*rI" for the constant operand here, since with just "I", gcc barfs.
1376   This might seem strange, but gcc folds away the dead code late.  */
1377#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1378  do {									      \
1379    if (__builtin_constant_p (bl) && bl > -0x8000 && bl <= 0x8000) {	      \
1380	if (__builtin_constant_p (ah) && (ah) == 0)			      \
1381	  __asm__ ("{ai|addic} %1,%3,%4\n\t{sfze|subfze} %0,%2"		      \
1382		   : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "*rI" (-bl)); \
1383	else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)	      \
1384	  __asm__ ("{ai|addic} %1,%3,%4\n\t{sfme|subfme} %0,%2"		      \
1385		   : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "*rI" (-bl)); \
1386	else if (__builtin_constant_p (bh) && (bh) == 0)		      \
1387	  __asm__ ("{ai|addic} %1,%3,%4\n\t{ame|addme} %0,%2"		      \
1388		   : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "*rI" (-bl)); \
1389	else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)	      \
1390	  __asm__ ("{ai|addic} %1,%3,%4\n\t{aze|addze} %0,%2"		      \
1391		   : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "*rI" (-bl)); \
1392	else								      \
1393	  __asm__ ("{ai|addic} %1,%4,%5\n\t{sfe|subfe} %0,%3,%2"	      \
1394		   : "=r" (sh), "=&r" (sl)				      \
1395		   : "r" (ah), "r" (bh), "rI" (al), "*rI" (-bl));	      \
1396      } else {								      \
1397	if (__builtin_constant_p (ah) && (ah) == 0)			      \
1398	  __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2"	      \
1399		   : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));  \
1400	else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)	      \
1401	  __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2"	      \
1402		   : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));  \
1403	else if (__builtin_constant_p (bh) && (bh) == 0)		      \
1404	  __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2"	      \
1405		   : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));  \
1406	else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)	      \
1407	  __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2"	      \
1408		   : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));  \
1409	else								      \
1410	  __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2"	      \
1411		   : "=r" (sh), "=&r" (sl)				      \
1412		   : "r" (ah), "r" (bh), "rI" (al), "r" (bl));		      \
1413      }									      \
1414  } while (0)
1415#endif /* ! _LONG_LONG_LIMB */
1416#define count_leading_zeros(count, x) \
1417  __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
1418#define COUNT_LEADING_ZEROS_0 64
1419#if 0 && __GMP_GNUC_PREREQ (4,4) /* Disable, this results in libcalls! */
1420#define umul_ppmm(w1, w0, u, v) \
1421  do {									\
1422    typedef unsigned int __ll_UTItype __attribute__((mode(TI)));	\
1423    __ll_UTItype __ll = (__ll_UTItype)(u) * (v);			\
1424    w1 = __ll >> 64;							\
1425    w0 = __ll;								\
1426  } while (0)
1427#endif
1428#if !defined (umul_ppmm)
1429#define umul_ppmm(ph, pl, m0, m1) \
1430  do {									\
1431    UDItype __m0 = (m0), __m1 = (m1);					\
1432    __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
1433    (pl) = __m0 * __m1;							\
1434  } while (0)
1435#endif
1436#define UMUL_TIME 15
1437#define smul_ppmm(ph, pl, m0, m1) \
1438  do {									\
1439    DItype __m0 = (m0), __m1 = (m1);					\
1440    __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
1441    (pl) = __m0 * __m1;							\
1442  } while (0)
1443#define SMUL_TIME 14  /* ??? */
1444#define UDIV_TIME 120 /* ??? */
1445#endif /* 64-bit PowerPC.  */
1446
1447#if defined (__pyr__) && W_TYPE_SIZE == 32
1448#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1449  __asm__ ("addw %5,%1\n\taddwc %3,%0"					\
1450	   : "=r" (sh), "=&r" (sl)					\
1451	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
1452	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1453#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1454  __asm__ ("subw %5,%1\n\tsubwb %3,%0"					\
1455	   : "=r" (sh), "=&r" (sl)					\
1456	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
1457	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
1458/* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP.  */
1459#define umul_ppmm(w1, w0, u, v) \
1460  ({union {UDItype __ll;						\
1461	   struct {USItype __h, __l;} __i;				\
1462	  } __x;							\
1463  __asm__ ("movw %1,%R0\n\tuemul %2,%0"					\
1464	   : "=&r" (__x.__ll)						\
1465	   : "g" ((USItype) (u)), "g" ((USItype)(v)));			\
1466  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1467#endif /* __pyr__ */
1468
1469#if defined (__ibm032__) /* RT/ROMP */  && W_TYPE_SIZE == 32
1470#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1471  __asm__ ("a %1,%5\n\tae %0,%3"					\
1472	   : "=r" (sh), "=&r" (sl)					\
1473	   : "0"  ((USItype)(ah)), "r" ((USItype)(bh)),			\
1474	     "%1" ((USItype)(al)), "r" ((USItype)(bl)))
1475#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1476  __asm__ ("s %1,%5\n\tse %0,%3"					\
1477	   : "=r" (sh), "=&r" (sl)					\
1478	   : "0" ((USItype)(ah)), "r" ((USItype)(bh)),			\
1479	     "1" ((USItype)(al)), "r" ((USItype)(bl)))
1480#define smul_ppmm(ph, pl, m0, m1) \
1481  __asm__ (								\
1482       "s	r2,r2\n"						\
1483"	mts r10,%2\n"							\
1484"	m	r2,%3\n"						\
1485"	m	r2,%3\n"						\
1486"	m	r2,%3\n"						\
1487"	m	r2,%3\n"						\
1488"	m	r2,%3\n"						\
1489"	m	r2,%3\n"						\
1490"	m	r2,%3\n"						\
1491"	m	r2,%3\n"						\
1492"	m	r2,%3\n"						\
1493"	m	r2,%3\n"						\
1494"	m	r2,%3\n"						\
1495"	m	r2,%3\n"						\
1496"	m	r2,%3\n"						\
1497"	m	r2,%3\n"						\
1498"	m	r2,%3\n"						\
1499"	m	r2,%3\n"						\
1500"	cas	%0,r2,r0\n"						\
1501"	mfs	r10,%1"							\
1502	   : "=r" (ph), "=r" (pl)					\
1503	   : "%r" ((USItype)(m0)), "r" ((USItype)(m1))			\
1504	   : "r2")
1505#define UMUL_TIME 20
1506#define UDIV_TIME 200
1507#define count_leading_zeros(count, x) \
1508  do {									\
1509    if ((x) >= 0x10000)							\
1510      __asm__ ("clz	%0,%1"						\
1511	       : "=r" (count) : "r" ((USItype)(x) >> 16));		\
1512    else								\
1513      {									\
1514	__asm__ ("clz	%0,%1"						\
1515		 : "=r" (count) : "r" ((USItype)(x)));			\
1516	(count) += 16;							\
1517      }									\
1518  } while (0)
1519#endif /* RT/ROMP */
1520
1521#if defined (__sh2__) && W_TYPE_SIZE == 32
1522#define umul_ppmm(w1, w0, u, v) \
1523  __asm__ ("dmulu.l %2,%3\n\tsts macl,%1\n\tsts mach,%0"		\
1524	   : "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach")
1525#define UMUL_TIME 5
1526#endif
1527
1528#if defined (__sparc__) && W_TYPE_SIZE == 32
1529#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1530  __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0"				\
1531	   : "=r" (sh), "=&r" (sl)					\
1532	   : "rJ" (ah), "rI" (bh),"%rJ" (al), "rI" (bl)			\
1533	   __CLOBBER_CC)
1534#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1535  __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0"				\
1536	   : "=r" (sh), "=&r" (sl)					\
1537	   : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl)	\
1538	   __CLOBBER_CC)
1539/* FIXME: When gcc -mcpu=v9 is used on Solaris, gcc/config/sol2-sld-64.h
1540   doesn't define anything to indicate that to us; it only sets __sparcv8. */
1541#if defined (__sparc_v9__) || defined (__sparcv9)
1542/* Perhaps we should use floating-point operations here?  */
1543#if 0
1544/* Triggers a bug making mpz/tests/t-gcd.c fail.
1545   Perhaps we simply need to zero-extend the inputs explicitly?  */
1546#define umul_ppmm(w1, w0, u, v) \
1547  __asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" :		\
1548	   "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1")
1549#else
1550/* Use v8 umul until above bug is fixed.  */
1551#define umul_ppmm(w1, w0, u, v) \
1552  __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1553#endif
1554/* Use a plain v8 divide for v9.  */
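/* The udiv instruction yields only the quotient; the remainder is recovered
   by multiplying back, r = n0 - q*d computed modulo 2^32, which is exact
   since the true remainder fits in one word.  */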
1555#define udiv_qrnnd(q, r, n1, n0, d) \
1556  do {									\
1557    USItype __q;							\
1558    __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"			\
1559	     : "=r" (__q) : "r" (n1), "r" (n0), "r" (d));		\
1560    (r) = (n0) - __q * (d);						\
1561    (q) = __q;								\
1562  } while (0)
1563#else
1564#if defined (__sparc_v8__)   /* gcc normal */				\
1565  || defined (__sparcv8)     /* gcc solaris */				\
1566  || HAVE_HOST_CPU_supersparc
1567/* Don't match an immediate range because 1) it is not often useful, and
1568   2) the 'I' constraint treats the range as a 13-bit signed interval,
1569   while we want to match a 13-bit interval, sign extended to 32 bits,
1570   but INTERPRETED AS UNSIGNED.  */
1571#define umul_ppmm(w1, w0, u, v) \
1572  __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1573#define UMUL_TIME 5
1574
1575#if HAVE_HOST_CPU_supersparc
1576#define UDIV_TIME 60		/* SuperSPARC timing */
1577#else
1578/* Don't use this on SuperSPARC because its udiv only handles 53 bit
1579   dividends and will trap to the kernel for the rest. */
1580#define udiv_qrnnd(q, r, n1, n0, d) \
1581  do {									\
1582    USItype __q;							\
1583    __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"			\
1584	     : "=r" (__q) : "r" (n1), "r" (n0), "r" (d));		\
1585    (r) = (n0) - __q * (d);						\
1586    (q) = __q;								\
1587  } while (0)
1588#define UDIV_TIME 25
1589#endif /* HAVE_HOST_CPU_supersparc */
1590
1591#else /* ! __sparc_v8__ */
1592#if defined (__sparclite__)
1593/* This has hardware multiply but not divide.  It also has two additional
1594   instructions, scan (ffs from high bit) and divscc.  */
1595#define umul_ppmm(w1, w0, u, v) \
1596  __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1597#define UMUL_TIME 5
1598#define udiv_qrnnd(q, r, n1, n0, d) \
1599  __asm__ ("! Inlined udiv_qrnnd\n"					\
1600"	wr	%%g0,%2,%%y	! Not a delayed write for sparclite\n"	\
1601"	tst	%%g0\n"							\
1602"	divscc	%3,%4,%%g1\n"						\
1603"	divscc	%%g1,%4,%%g1\n"						\
1604"	divscc	%%g1,%4,%%g1\n"						\
1605"	divscc	%%g1,%4,%%g1\n"						\
1606"	divscc	%%g1,%4,%%g1\n"						\
1607"	divscc	%%g1,%4,%%g1\n"						\
1608"	divscc	%%g1,%4,%%g1\n"						\
1609"	divscc	%%g1,%4,%%g1\n"						\
1610"	divscc	%%g1,%4,%%g1\n"						\
1611"	divscc	%%g1,%4,%%g1\n"						\
1612"	divscc	%%g1,%4,%%g1\n"						\
1613"	divscc	%%g1,%4,%%g1\n"						\
1614"	divscc	%%g1,%4,%%g1\n"						\
1615"	divscc	%%g1,%4,%%g1\n"						\
1616"	divscc	%%g1,%4,%%g1\n"						\
1617"	divscc	%%g1,%4,%%g1\n"						\
1618"	divscc	%%g1,%4,%%g1\n"						\
1619"	divscc	%%g1,%4,%%g1\n"						\
1620"	divscc	%%g1,%4,%%g1\n"						\
1621"	divscc	%%g1,%4,%%g1\n"						\
1622"	divscc	%%g1,%4,%%g1\n"						\
1623"	divscc	%%g1,%4,%%g1\n"						\
1624"	divscc	%%g1,%4,%%g1\n"						\
1625"	divscc	%%g1,%4,%%g1\n"						\
1626"	divscc	%%g1,%4,%%g1\n"						\
1627"	divscc	%%g1,%4,%%g1\n"						\
1628"	divscc	%%g1,%4,%%g1\n"						\
1629"	divscc	%%g1,%4,%%g1\n"						\
1630"	divscc	%%g1,%4,%%g1\n"						\
1631"	divscc	%%g1,%4,%%g1\n"						\
1632"	divscc	%%g1,%4,%%g1\n"						\
1633"	divscc	%%g1,%4,%0\n"						\
1634"	rd	%%y,%1\n"						\
1635"	bl,a 1f\n"							\
1636"	add	%1,%4,%1\n"						\
1637"1:	! End of inline udiv_qrnnd"					\
1638	   : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d)		\
1639	   : "%g1" __AND_CLOBBER_CC)
1640#define UDIV_TIME 37
1641#define count_leading_zeros(count, x) \
1642  __asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x))
1643/* Early sparclites return 63 for an argument of 0, but they warn that future
1644   implementations might change this.  Therefore, leave COUNT_LEADING_ZEROS_0
1645   undefined.  */
1646#endif /* __sparclite__ */
1647#endif /* __sparc_v8__ */
1648#endif /* __sparc_v9__ */
1649/* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd.  */
1650#ifndef umul_ppmm
1651#define umul_ppmm(w1, w0, u, v) \
1652  __asm__ ("! Inlined umul_ppmm\n"					\
1653"	wr	%%g0,%2,%%y	! SPARC has 0-3 delay insn after a wr\n" \
1654"	sra	%3,31,%%g2	! Don't move this insn\n"		\
1655"	and	%2,%%g2,%%g2	! Don't move this insn\n"		\
1656"	andcc	%%g0,0,%%g1	! Don't move this insn\n"		\
1657"	mulscc	%%g1,%3,%%g1\n"						\
1658"	mulscc	%%g1,%3,%%g1\n"						\
1659"	mulscc	%%g1,%3,%%g1\n"						\
1660"	mulscc	%%g1,%3,%%g1\n"						\
1661"	mulscc	%%g1,%3,%%g1\n"						\
1662"	mulscc	%%g1,%3,%%g1\n"						\
1663"	mulscc	%%g1,%3,%%g1\n"						\
1664"	mulscc	%%g1,%3,%%g1\n"						\
1665"	mulscc	%%g1,%3,%%g1\n"						\
1666"	mulscc	%%g1,%3,%%g1\n"						\
1667"	mulscc	%%g1,%3,%%g1\n"						\
1668"	mulscc	%%g1,%3,%%g1\n"						\
1669"	mulscc	%%g1,%3,%%g1\n"						\
1670"	mulscc	%%g1,%3,%%g1\n"						\
1671"	mulscc	%%g1,%3,%%g1\n"						\
1672"	mulscc	%%g1,%3,%%g1\n"						\
1673"	mulscc	%%g1,%3,%%g1\n"						\
1674"	mulscc	%%g1,%3,%%g1\n"						\
1675"	mulscc	%%g1,%3,%%g1\n"						\
1676"	mulscc	%%g1,%3,%%g1\n"						\
1677"	mulscc	%%g1,%3,%%g1\n"						\
1678"	mulscc	%%g1,%3,%%g1\n"						\
1679"	mulscc	%%g1,%3,%%g1\n"						\
1680"	mulscc	%%g1,%3,%%g1\n"						\
1681"	mulscc	%%g1,%3,%%g1\n"						\
1682"	mulscc	%%g1,%3,%%g1\n"						\
1683"	mulscc	%%g1,%3,%%g1\n"						\
1684"	mulscc	%%g1,%3,%%g1\n"						\
1685"	mulscc	%%g1,%3,%%g1\n"						\
1686"	mulscc	%%g1,%3,%%g1\n"						\
1687"	mulscc	%%g1,%3,%%g1\n"						\
1688"	mulscc	%%g1,%3,%%g1\n"						\
1689"	mulscc	%%g1,0,%%g1\n"						\
1690"	add	%%g1,%%g2,%0\n"						\
1691"	rd	%%y,%1"							\
1692	   : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v)			\
1693	   : "%g1", "%g2" __AND_CLOBBER_CC)
1694#define UMUL_TIME 39		/* 39 instructions */
1695#endif
1696#ifndef udiv_qrnnd
1697#ifndef LONGLONG_STANDALONE
1698#define udiv_qrnnd(q, r, n1, n0, d) \
1699  do { UWtype __r;							\
1700    (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d));			\
1701    (r) = __r;								\
1702  } while (0)
1703extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype *, UWtype, UWtype, UWtype));
1704#ifndef UDIV_TIME
1705#define UDIV_TIME 140
1706#endif
1707#endif /* LONGLONG_STANDALONE */
1708#endif /* udiv_qrnnd */
1709#endif /* __sparc__ */
1710
1711#if defined (__sparc__) && W_TYPE_SIZE == 64
1712#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1713  __asm__ (								\
1714       "addcc	%r4,%5,%1\n"						\
1715      "	addccc	%r6,%7,%%g0\n"						\
1716      "	addc	%r2,%3,%0"						\
1717	  : "=r" (sh), "=&r" (sl)					\
1718	  : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl),		\
1719	    "%rJ" ((al) >> 32), "rI" ((bl) >> 32)			\
1720	   __CLOBBER_CC)
1721#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1722  __asm__ (								\
1723       "subcc	%r4,%5,%1\n"						\
1724      "	subccc	%r6,%7,%%g0\n"						\
1725      "	subc	%r2,%3,%0"						\
1726	  : "=r" (sh), "=&r" (sl)					\
1727	  : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl),		\
1728	    "rJ" ((al) >> 32), "rI" ((bl) >> 32)			\
1729	   __CLOBBER_CC)
1730#endif
1731
1732#if defined (__vax__) && W_TYPE_SIZE == 32
1733#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1734  __asm__ ("addl2 %5,%1\n\tadwc %3,%0"					\
1735	   : "=g" (sh), "=&g" (sl)					\
1736	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
1737	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1738#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1739  __asm__ ("subl2 %5,%1\n\tsbwc %3,%0"					\
1740	   : "=g" (sh), "=&g" (sl)					\
1741	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
1742	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
1743#define smul_ppmm(xh, xl, m0, m1) \
1744  do {									\
1745    union {UDItype __ll;						\
1746	   struct {USItype __l, __h;} __i;				\
1747	  } __x;							\
1748    USItype __m0 = (m0), __m1 = (m1);					\
1749    __asm__ ("emul %1,%2,$0,%0"						\
1750	     : "=g" (__x.__ll) : "g" (__m0), "g" (__m1));		\
1751    (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
1752  } while (0)
1753#define sdiv_qrnnd(q, r, n1, n0, d) \
1754  do {									\
1755    union {DItype __ll;							\
1756	   struct {SItype __l, __h;} __i;				\
1757	  } __x;							\
1758    __x.__i.__h = n1; __x.__i.__l = n0;					\
1759    __asm__ ("ediv %3,%2,%0,%1"						\
1760	     : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d));		\
1761  } while (0)
1762#if 0
1763/* FIXME: This instruction appears to be unimplemented on some systems (vax
1764   8800 maybe). */
1765#define count_trailing_zeros(count,x)					\
1766  do {									\
1767    __asm__ ("ffs 0, 31, %1, %0"					\
1768	     : "=g" (count)						\
1769	     : "g" ((USItype) (x)));					\
1770  } while (0)
1771#endif
1772#endif /* __vax__ */
1773
1774#if defined (__z8000__) && W_TYPE_SIZE == 16
1775#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1776  __asm__ ("add	%H1,%H5\n\tadc	%H0,%H3"				\
1777	   : "=r" (sh), "=&r" (sl)					\
1778	   : "0"  ((unsigned int)(ah)), "r" ((unsigned int)(bh)),	\
1779	     "%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
1780#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1781  __asm__ ("sub	%H1,%H5\n\tsbc	%H0,%H3"				\
1782	   : "=r" (sh), "=&r" (sl)					\
1783	   : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)),	\
1784	     "1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
1785#define umul_ppmm(xh, xl, m0, m1) \
1786  do {									\
1787    union {long int __ll;						\
1788	   struct {unsigned int __h, __l;} __i;				\
1789	  } __x;							\
1790    unsigned int __m0 = (m0), __m1 = (m1);				\
1791    __asm__ ("mult	%S0,%H3"					\
1792	     : "=r" (__x.__i.__h), "=r" (__x.__i.__l)			\
1793	     : "%1" (__m0), "rQR" (__m1));				\
1794    (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
1795    (xh) += ((((signed int) __m0 >> 15) & __m1)				\
1796	     + (((signed int) __m1 >> 15) & __m0));			\
1797  } while (0)
1798#endif /* __z8000__ */
1799
1800#endif /* __GNUC__ */
1801
1802#endif /* NO_ASM */
1803
1804
1805#if !defined (umul_ppmm) && defined (__umulsidi3)
1806#define umul_ppmm(ph, pl, m0, m1) \
1807  {									\
1808    UDWtype __ll = __umulsidi3 (m0, m1);				\
1809    ph = (UWtype) (__ll >> W_TYPE_SIZE);				\
1810    pl = (UWtype) __ll;							\
1811  }
1812#endif
1813
1814#if !defined (__umulsidi3)
1815#define __umulsidi3(u, v) \
1816  ({UWtype __hi, __lo;							\
1817    umul_ppmm (__hi, __lo, u, v);					\
1818    ((UDWtype) __hi << W_TYPE_SIZE) | __lo; })
1819#endif
1820
1821
1822/* Use mpn_umul_ppmm or mpn_udiv_qrnnd functions, if they exist.  The "_r"
1823   forms have "reversed" arguments, meaning the pointer is last, which
1824   sometimes allows better parameter passing, in particular on 64-bit
1825   hppa. */
1826
1827#define mpn_umul_ppmm  __MPN(umul_ppmm)
1828extern UWtype mpn_umul_ppmm _PROTO ((UWtype *, UWtype, UWtype));
1829
1830#if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm  \
1831  && ! defined (LONGLONG_STANDALONE)
1832#define umul_ppmm(wh, wl, u, v)						      \
1833  do {									      \
1834    UWtype __umul_ppmm__p0;						      \
1835    (wh) = mpn_umul_ppmm (&__umul_ppmm__p0, (UWtype) (u), (UWtype) (v));      \
1836    (wl) = __umul_ppmm__p0;						      \
1837  } while (0)
1838#endif
1839
1840#define mpn_umul_ppmm_r  __MPN(umul_ppmm_r)
1841extern UWtype mpn_umul_ppmm_r _PROTO ((UWtype, UWtype, UWtype *));
1842
1843#if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm_r	\
1844  && ! defined (LONGLONG_STANDALONE)
1845#define umul_ppmm(wh, wl, u, v)						      \
1846  do {									      \
1847    UWtype __umul_ppmm__p0;						      \
1848    (wh) = mpn_umul_ppmm_r ((UWtype) (u), (UWtype) (v), &__umul_ppmm__p0);    \
1849    (wl) = __umul_ppmm__p0;						      \
1850  } while (0)
1851#endif
1852
1853#define mpn_udiv_qrnnd  __MPN(udiv_qrnnd)
1854extern UWtype mpn_udiv_qrnnd _PROTO ((UWtype *, UWtype, UWtype, UWtype));
1855
1856#if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd	\
1857  && ! defined (LONGLONG_STANDALONE)
1858#define udiv_qrnnd(q, r, n1, n0, d)					\
1859  do {									\
1860    UWtype __udiv_qrnnd__r;						\
1861    (q) = mpn_udiv_qrnnd (&__udiv_qrnnd__r,				\
1862			  (UWtype) (n1), (UWtype) (n0), (UWtype) d);	\
1863    (r) = __udiv_qrnnd__r;						\
1864  } while (0)
1865#endif
1866
1867#define mpn_udiv_qrnnd_r  __MPN(udiv_qrnnd_r)
1868extern UWtype mpn_udiv_qrnnd_r _PROTO ((UWtype, UWtype, UWtype, UWtype *));
1869
1870#if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd_r	\
1871  && ! defined (LONGLONG_STANDALONE)
1872#define udiv_qrnnd(q, r, n1, n0, d)					\
1873  do {									\
1874    UWtype __udiv_qrnnd__r;						\
1875    (q) = mpn_udiv_qrnnd_r ((UWtype) (n1), (UWtype) (n0), (UWtype) d,	\
1876			    &__udiv_qrnnd__r);				\
1877    (r) = __udiv_qrnnd__r;						\
1878  } while (0)
1879#endif
1880
1881
1882/* If this machine has no inline assembler, use C macros.  */
1883
1884#if !defined (add_ssaaaa)
1885#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1886  do {									\
1887    UWtype __x;								\
1888    __x = (al) + (bl);							\
1889    (sh) = (ah) + (bh) + (__x < (al));					\
1890    (sl) = __x;								\
1891  } while (0)
1892#endif
1893
1894#if !defined (sub_ddmmss)
1895#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1896  do {									\
1897    UWtype __x;								\
1898    __x = (al) - (bl);							\
1899    (sh) = (ah) - (bh) - ((al) < (bl));                                 \
1900    (sl) = __x;								\
1901  } while (0)
1902#endif
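/* A minimal usage sketch (hypothetical helper, not part of this file):
   chaining the generic macros for double-word arithmetic.  The carry out of
   the low words is detected by the unsigned comparison __x < (al) inside
   add_ssaaaa, and the borrow by (al) < (bl) inside sub_ddmmss.  */
#if 0
static void
example_double_word (UWtype ah, UWtype al, UWtype bh, UWtype bl,
		     UWtype sum[2], UWtype diff[2])
{
  add_ssaaaa (sum[1], sum[0], ah, al, bh, bl);	 /* sum = a + b */
  sub_ddmmss (diff[1], diff[0], ah, al, bh, bl); /* diff = a - b */
}
#endif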
1903
1904/* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of
1905   smul_ppmm.  */
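/* For reference: writing W = W_TYPE_SIZE and viewing u, v also as signed
   values, the unsigned product satisfies, modulo 2^(2*W),
     u * v = (signed) u * (signed) v
	     + 2^W * (u's top bit set ? v : 0)
	     + 2^W * (v's top bit set ? u : 0),
   and -(x >> (W - 1)) below is an all-ones mask exactly when the top bit of
   x is set.  The smul_ppmm fallback further down applies the same identity
   with the correction terms subtracted instead of added.  */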
1906#if !defined (umul_ppmm) && defined (smul_ppmm)
1907#define umul_ppmm(w1, w0, u, v)						\
1908  do {									\
1909    UWtype __w1;							\
1910    UWtype __xm0 = (u), __xm1 = (v);					\
1911    smul_ppmm (__w1, w0, __xm0, __xm1);					\
1912    (w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1)		\
1913		+ (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0);		\
1914  } while (0)
1915#endif
1916
1917/* If we still don't have umul_ppmm, define it using plain C.
1918
1919   For reference, when this code is used for squaring (i.e. u and v identical
1920   expressions), gcc recognises __x1 and __x2 are the same and generates 3
1921   multiplies, not 4.  The subsequent additions could be optimized a bit,
1922   but the only place GMP currently uses such a square is mpn_sqr_basecase,
1923   and chips obliged to use this generic C umul will have plenty of worse
1924   performance problems than a couple of extra instructions on the diagonal
1925   of sqr_basecase.  */
1926
1927#if !defined (umul_ppmm)
1928#define umul_ppmm(w1, w0, u, v)						\
1929  do {									\
1930    UWtype __x0, __x1, __x2, __x3;					\
1931    UHWtype __ul, __vl, __uh, __vh;					\
1932    UWtype __u = (u), __v = (v);					\
1933									\
1934    __ul = __ll_lowpart (__u);						\
1935    __uh = __ll_highpart (__u);						\
1936    __vl = __ll_lowpart (__v);						\
1937    __vh = __ll_highpart (__v);						\
1938									\
1939    __x0 = (UWtype) __ul * __vl;					\
1940    __x1 = (UWtype) __ul * __vh;					\
1941    __x2 = (UWtype) __uh * __vl;					\
1942    __x3 = (UWtype) __uh * __vh;					\
1943									\
1944    __x1 += __ll_highpart (__x0);/* this can't give carry */		\
1945    __x1 += __x2;		/* but this indeed can */		\
1946    if (__x1 < __x2)		/* did we get it? */			\
1947      __x3 += __ll_B;		/* yes, add it in the proper pos. */	\
1948									\
1949    (w1) = __x3 + __ll_highpart (__x1);					\
1950    (w0) = (__x1 << W_TYPE_SIZE/2) + __ll_lowpart (__x0);		\
1951  } while (0)
1952#endif
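/* For reference, the macro above is schoolbook multiplication on half-words:
   with B = 2^(W_TYPE_SIZE/2) = __ll_B, u = uh*B + ul and v = vh*B + vl give
     u*v = uh*vh*B^2 + (uh*vl + ul*vh)*B + ul*vl.
   Each partial product fits in one word since (B-1)^2 < B^2.  Adding
   __ll_highpart (__x0) to __x1 cannot overflow, because
   (B-1)^2 + (B-1) < B^2, whereas adding __x2 can, which is why that carry is
   propagated into __x3 as __ll_B.  */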
1953
1954/* If we don't have smul_ppmm, define it using umul_ppmm (which surely will
1955   exist in one form or another).  */
1956#if !defined (smul_ppmm)
1957#define smul_ppmm(w1, w0, u, v)						\
1958  do {									\
1959    UWtype __w1;							\
1960    UWtype __xm0 = (u), __xm1 = (v);					\
1961    umul_ppmm (__w1, w0, __xm0, __xm1);					\
1962    (w1) = __w1 - (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1)		\
1963		- (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0);		\
1964  } while (0)
1965#endif
1966
1967/* Define this unconditionally, so it can be used for debugging.  */
1968#define __udiv_qrnnd_c(q, r, n1, n0, d) \
1969  do {									\
1970    UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m;			\
1971									\
1972    ASSERT ((d) != 0);							\
1973    ASSERT ((n1) < (d));						\
1974									\
1975    __d1 = __ll_highpart (d);						\
1976    __d0 = __ll_lowpart (d);						\
1977									\
1978    __q1 = (n1) / __d1;							\
1979    __r1 = (n1) - __q1 * __d1;						\
1980    __m = __q1 * __d0;							\
1981    __r1 = __r1 * __ll_B | __ll_highpart (n0);				\
1982    if (__r1 < __m)							\
1983      {									\
1984	__q1--, __r1 += (d);						\
1985	if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\
1986	  if (__r1 < __m)						\
1987	    __q1--, __r1 += (d);					\
1988      }									\
1989    __r1 -= __m;							\
1990									\
1991    __q0 = __r1 / __d1;							\
1992    __r0 = __r1  - __q0 * __d1;						\
1993    __m = __q0 * __d0;							\
1994    __r0 = __r0 * __ll_B | __ll_lowpart (n0);				\
1995    if (__r0 < __m)							\
1996      {									\
1997	__q0--, __r0 += (d);						\
1998	if (__r0 >= (d))						\
1999	  if (__r0 < __m)						\
2000	    __q0--, __r0 += (d);					\
2001      }									\
2002    __r0 -= __m;							\
2003									\
2004    (q) = __q1 * __ll_B | __q0;						\
2005    (r) = __r0;								\
2006  } while (0)
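/* __udiv_qrnnd_c needs a normalized divisor (high bit set), hence
   UDIV_NEEDS_NORMALIZATION below.  A minimal sketch of how a caller might
   pre-shift the operands (hypothetical helper, not part of this file):  */
#if 0
static void
example_udiv_normalize (UWtype *qp, UWtype *rp, UWtype n1, UWtype n0, UWtype d)
{
  int __cnt;
  ASSERT (d != 0 && n1 < d);
  count_leading_zeros (__cnt, d);
  if (__cnt != 0)
    {
      /* Shift divisor and dividend left; the quotient is unchanged and the
	 remainder is scaled by 2^__cnt.  */
      d <<= __cnt;
      n1 = (n1 << __cnt) | (n0 >> (W_TYPE_SIZE - __cnt));
      n0 <<= __cnt;
    }
  __udiv_qrnnd_c (*qp, *rp, n1, n0, d);
  *rp >>= __cnt;		/* scale the remainder back down */
}
#endif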
2007
2008/* If the processor has no udiv_qrnnd but does have sdiv_qrnnd, go through
2009   __udiv_w_sdiv (defined in libgcc or elsewhere).  */
2010#if !defined (udiv_qrnnd) && defined (sdiv_qrnnd)
2011#define udiv_qrnnd(q, r, nh, nl, d) \
2012  do {									\
2013    UWtype __r;								\
2014    (q) = __MPN(udiv_w_sdiv) (&__r, nh, nl, d);				\
2015    (r) = __r;								\
2016  } while (0)
2017__GMP_DECLSPEC UWtype __MPN(udiv_w_sdiv) (UWtype *, UWtype, UWtype, UWtype);
2018#endif
2019
2020/* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c.  */
2021#if !defined (udiv_qrnnd)
2022#define UDIV_NEEDS_NORMALIZATION 1
2023#define udiv_qrnnd __udiv_qrnnd_c
2024#endif
2025
2026#if !defined (count_leading_zeros)
2027#define count_leading_zeros(count, x) \
2028  do {									\
2029    UWtype __xr = (x);							\
2030    UWtype __a;								\
2031									\
2032    if (W_TYPE_SIZE == 32)						\
2033      {									\
2034	__a = __xr < ((UWtype) 1 << 2*__BITS4)				\
2035	  ? (__xr < ((UWtype) 1 << __BITS4) ? 1 : __BITS4 + 1)		\
2036	  : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 + 1		\
2037	  : 3*__BITS4 + 1);						\
2038      }									\
2039    else								\
2040      {									\
2041	for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8)			\
2042	  if (((__xr >> __a) & 0xff) != 0)				\
2043	    break;							\
2044	++__a;								\
2045      }									\
2046									\
2047    (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a];		\
2048  } while (0)
2049/* This version gives a well-defined value for zero. */
2050#define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1)
2051#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2052#endif
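/* The generic count_leading_zeros above first locates the highest nonzero
   byte of x (by direct comparisons on 32-bit machines, by a byte loop
   otherwise), then finishes the count with a lookup in the 128-entry
   __clz_tab declared below.  */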
2053
2054/* clz_tab needed by mpn/x86/pentium/mod_1.asm in a fat binary */
2055#if HAVE_HOST_CPU_FAMILY_x86 && WANT_FAT_BINARY
2056#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2057#endif
2058
2059#ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2060extern const unsigned char __GMP_DECLSPEC __clz_tab[128];
2061#endif
2062
2063#if !defined (count_trailing_zeros)
2064/* Define count_trailing_zeros using count_leading_zeros.  The latter might be
2065   defined in asm, but if it is not, the C version above is good enough.  */
2066#define count_trailing_zeros(count, x) \
2067  do {									\
2068    UWtype __ctz_x = (x);						\
2069    UWtype __ctz_c;							\
2070    ASSERT (__ctz_x != 0);						\
2071    count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x);			\
2072    (count) = W_TYPE_SIZE - 1 - __ctz_c;				\
2073  } while (0)
2074#endif
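/* count_trailing_zeros above relies on x & -x isolating the lowest set bit;
   e.g. with W_TYPE_SIZE = 32 and x = 0b101000, x & -x = 0b1000, whose
   leading-zero count is 28, giving count = 32 - 1 - 28 = 3.  */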
2075
2076#ifndef UDIV_NEEDS_NORMALIZATION
2077#define UDIV_NEEDS_NORMALIZATION 0
2078#endif
2079
2080/* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, and
2081   hence the latter should always be used.  */
2082#ifndef UDIV_PREINV_ALWAYS
2083#define UDIV_PREINV_ALWAYS 0
2084#endif
2085
2086/* Give defaults for UMUL_TIME and UDIV_TIME.  */
2087#ifndef UMUL_TIME
2088#define UMUL_TIME 1
2089#endif
2090
2091#ifndef UDIV_TIME
2092#define UDIV_TIME UMUL_TIME
2093#endif
2094